In [24]:
import torch
import numpy as np
import re
from transformers import AutoTokenizer, AutoModel, AutoConfig
from nltk import word_tokenize
import re
import tensorflow as tf
import nltk.corpus 
import nltk
from keras.utils import np_utils
from numpy import savez_compressed

Using TensorFlow backend.


Retrieving dataset (George Orweil's 1984)

In [14]:
nltk.download('mte_teip5')
data = nltk.corpus.multext_east.tagged_sents("oana-en.xml", "universal")

[nltk_data] Downloading package mte_teip5 to
[nltk_data]     /Users/Mahwish/nltk_data...
[nltk_data]   Package mte_teip5 is already up-to-date!


Retrieving saved variables from below queries

In [51]:
from numpy import load
# load dict of arrays + extract the first array
dict_data = load('BERT_labels_xx.npz')
xx = dict_data['arr_0'] # contextual embeddings tensor saved for range(0,400)
dict_data = load('BERT_labels_yy.npz')
yy = dict_data['arr_0'] # corresponding labels saved in list
dict_data = load('BERT_bubu2_yy.npz')
bubu2 = dict_data['arr_0'] # to check the text corresponding to label
dict_data = load('BERT_extra_set_list_xx.npz')
extra_set_list = dict_data['arr_0'] # to check the text corresponding to embedding
dict_data = load('BERT_dev_labels_yy.npz')
dev_labels = dict_data['arr_0'] # previously computed labels (extension of yy)

Checking transformer version is correct -- older version causes problems

In [9]:
#!pip install transformers --upgrade
#import transformers
#print(transformers.__version__)
#4.9.0.dev0 + required

Set seed

In [4]:
#for reproducibility
seed=20
cust_seed = np.random.seed(seed)

Checking how indexing works when splitting sentences

In [26]:
#Testing splitting methods
st = 'The cat sat on the mat'
st.split(" ").index('mat')

5

The below function creates chunks of words splitting the text after 40 tokens. Chose this based on average sentence length to ensure 'context' is kept to give an idea of syntactic structure of the sentence when retrieving a particular words embedding, but not too much context as to diminish the attention given to the particular tokens. This should help when POS tagging (a noun came before i'th token, punctuation after, etc).

In [15]:
def set_creator(chunk):
    text = []
    tag_counter = 0
    for idx3, k in enumerate(data):
        for idx4, m in enumerate(k):
            if chunk*40 <= tag_counter < (chunk+1)*40:
                text.append(m[0])
            tag_counter += 1
    text = ' '.join(text)
    input_string = text
    
    #Here I'm trying to experiment on how to pad the string so that the punctuation is separated when grabbing the embedding
    # if the string isn't padded, it'll try to find 'pumpkin.' instead of 'pumpkin', '.'
    pattern = '([:;.,!?()])'
    input_string = re.sub(pattern, r' \1 ', input_string)
    input_string = re.sub('\s\s*',' ',input_string)
    input_string = re.sub(" '","'",input_string)
    input_string = re.sub("s' ","s ' ",input_string)
    return input_string   
#print(set_creator())

The two functions below (from huggingface blog) used to retrieve the contextual embeddings of each token given a context string generated from the above. Comments added for info.

In [45]:
def get_hidden_states(encoded, token_ids_word, model, layers):
    """Push input IDs through model. Stack and sum 'layers' (last four by default).
       Select only those subword token outputs that belong to our word of interest
       and average them."""
    with torch.no_grad():
        output = model(**encoded)
        #print(output)
        
    # Gets all hidden states from model
    states = output.hidden_states
    # Stack and sum all requested layers (defined which ones later - using -4, -3, -2, -1)
    output = torch.stack([states[o] for o in layers]).sum(0).squeeze()
    
    print('output',np.shape(output))
    # Only select the tokens that constitute the requested word (since subtokens can be generated)
    word_tokens_output = output[token_ids_word]
    #print('word_tokens_output',word_tokens_output[0][0], np.shape(word_tokens_output))
    #print('avg subcomp word',word_tokens_output.mean(dim=0)[0],np.shape((word_tokens_output.mean(dim=0))))
    return word_tokens_output.mean(dim=0) # this averages the sub components of the word

def get_word_vector(sent, idx, tokenizer, model, layers):
    """Get a word vector by first tokenizing the input sentence, getting all token idxs
       that make up the word of interest, and then 'get_hidden_states'."""
    encoded = tokenizer.encode_plus(sent, return_tensors="pt")
    print('encoded',encoded)
    # get all token idxs that belong to the word of interest
    token_ids_word = np.where(np.array(encoded.word_ids()) == idx)
    #print('token_ids_word',token_ids_word)
    print('encoded.word_ids-idx',(np.array(encoded.word_ids()) == idx))
    return get_hidden_states(encoded, token_ids_word, model, layers)

Example below printing sections of above code, and retrieving the contextual embeddings of word 'mat' from sentence 'The cat sat on the mat'.

In [13]:
layers = [-4, -3, -2, -1]
tokenizer = AutoTokenizer.from_pretrained("vblagoje/bert-english-uncased-finetuned-pos")
config = AutoConfig.from_pretrained("vblagoje/bert-english-uncased-finetuned-pos", output_hidden_states=True)
model = AutoModel.from_pretrained("vblagoje/bert-english-uncased-finetuned-pos", config=config)

sent = "The cat sat on the mat" 
idx = sent.split(" ").index('mat')
word_embedding = get_word_vector(sent, idx, tokenizer, model, layers)

Some weights of the model checkpoint at vblagoje/bert-english-uncased-finetuned-pos were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


encoded {'input_ids': tensor([[  101,  1996,  4937,  2938,  2006,  1996, 13523,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}
encoded.word_ids-idx [False False False False False False  True False]
output torch.Size([8, 768])
word_tokens_output tensor(2.5789) torch.Size([1, 768])
avg subcomp word tensor(2.5789) torch.Size([768])


Below script retrieves embeddings for each token within a context chunk ( of length 40), and concatenates them into a contextual embedding tensor (tens). The tens for which results are beneath is for range(0,400) but this takes a while to run so I've changed to range (0,10) for demonstration purpose.

In [48]:
# Use last four layers by default
counter=0
layers = [-4, -3, -2, -1]
tokenizer = AutoTokenizer.from_pretrained("vblagoje/bert-english-uncased-finetuned-pos")
config = AutoConfig.from_pretrained("vblagoje/bert-english-uncased-finetuned-pos", output_hidden_states=True)
model = AutoModel.from_pretrained("vblagoje/bert-english-uncased-finetuned-pos", output_hidden_states=True)
tens = torch.empty((0), dtype=torch.int32)
extra_set_list = []
for q in range(0,10):
    input_string = set_creator(q)
    ww = input_string.split(" ")
    print('q',q)

    for ix, i in enumerate(ww):
        print(ix,i)
        #print(list_labels[ix])
        if i != '':
            extra_set_list.append(i)
            idx = ix
            word_embedding = get_word_vector(input_string,idx,tokenizer,model,layers)
            
            if (ix == 0 & q == 0): # CHANGE THIS 
                tens = word_embedding[None,:]
                print(word_embedding)
            else:
                tens = torch.cat((tens, word_embedding[None,:]),0)
                print(ix,i, 'tens shape',np.shape(tens))
                print(word_embedding[0])
                print('counter',counter)
            
            counter+=1

Some weights of the model checkpoint at vblagoje/bert-english-uncased-finetuned-pos were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


q 0
0 It
encoded {'input_ids': tensor([[  101,  2009,  2001,  1037,  4408,  3147,  2154,  1999,  2258,  1010,
          1998,  1996, 20940,  2020,  8478,  7093,  1012, 10180,  3044,  1010,
          2010,  5413, 16371, 17269,  2046,  2010,  7388,  1999,  2019,  3947,
          2000,  4019,  1996, 25047,  3612,  1010,  5707,  2855,  2083,  1996,
          3221,  4303,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
encoded.word_ids-idx [False  True False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False]


output torch.Size([43, 768])
1 was tens shape torch.Size([2, 768])
tensor(0.5775)
counter 1
2 a
encoded {'input_ids': tensor([[  101,  2009,  2001,  1037,  4408,  3147,  2154,  1999,  2258,  1010,
          1998,  1996, 20940,  2020,  8478,  7093,  1012, 10180,  3044,  1010,
          2010,  5413, 16371, 17269,  2046,  2010,  7388,  1999,  2019,  3947,
          2000,  4019,  1996, 25047,  3612,  1010,  5707,  2855,  2083,  1996,
          3221,  4303,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
encoded.word_ids-idx [False False False  True False False False False False False False False
 False False False False False False False False False False False False
 False False False False False

output torch.Size([43, 768])
9 and tens shape torch.Size([10, 768])
tensor(-1.8758)
counter 9
10 the
encoded {'input_ids': tensor([[  101,  2009,  2001,  1037,  4408,  3147,  2154,  1999,  2258,  1010,
          1998,  1996, 20940,  2020,  8478,  7093,  1012, 10180,  3044,  1010,
          2010,  5413, 16371, 17269,  2046,  2010,  7388,  1999,  2019,  3947,
          2000,  4019,  1996, 25047,  3612,  1010,  5707,  2855,  2083,  1996,
          3221,  4303,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
encoded.word_ids-idx [False False False False False False False False False False False  True
 False False False False False False False False False False False False
 False False False False 

output torch.Size([43, 768])
17 Smith tens shape torch.Size([18, 768])
tensor(3.2208)
counter 17
18 ,
encoded {'input_ids': tensor([[  101,  2009,  2001,  1037,  4408,  3147,  2154,  1999,  2258,  1010,
          1998,  1996, 20940,  2020,  8478,  7093,  1012, 10180,  3044,  1010,
          2010,  5413, 16371, 17269,  2046,  2010,  7388,  1999,  2019,  3947,
          2000,  4019,  1996, 25047,  3612,  1010,  5707,  2855,  2083,  1996,
          3221,  4303,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
encoded.word_ids-idx [False False False False False False False False False False False False
 False False False False False False False  True False False False False
 False False False False

output torch.Size([43, 768])
25 in tens shape torch.Size([26, 768])
tensor(-3.7581)
counter 25
26 an
encoded {'input_ids': tensor([[  101,  2009,  2001,  1037,  4408,  3147,  2154,  1999,  2258,  1010,
          1998,  1996, 20940,  2020,  8478,  7093,  1012, 10180,  3044,  1010,
          2010,  5413, 16371, 17269,  2046,  2010,  7388,  1999,  2019,  3947,
          2000,  4019,  1996, 25047,  3612,  1010,  5707,  2855,  2083,  1996,
          3221,  4303,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
encoded.word_ids-idx [False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False 

output torch.Size([43, 768])
33 , tens shape torch.Size([34, 768])
tensor(0.6717)
counter 33
34 slipped
encoded {'input_ids': tensor([[  101,  2009,  2001,  1037,  4408,  3147,  2154,  1999,  2258,  1010,
          1998,  1996, 20940,  2020,  8478,  7093,  1012, 10180,  3044,  1010,
          2010,  5413, 16371, 17269,  2046,  2010,  7388,  1999,  2019,  3947,
          2000,  4019,  1996, 25047,  3612,  1010,  5707,  2855,  2083,  1996,
          3221,  4303,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
encoded.word_ids-idx [False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False Fal

output torch.Size([44, 768])
1 Victory tens shape torch.Size([2, 768])
tensor(0.4362)
counter 41
2 Mansions
encoded {'input_ids': tensor([[  101,  1997,  3377, 26842,  1010,  2295,  2025,  2855,  2438,  2000,
          4652,  1037, 28693,  1997, 24842,  3723,  6497,  2013,  5738,  2247,
          2007,  2032,  1012,  1996,  6797, 15488, 20042,  1997, 17020, 28540,
          1998,  2214, 17768, 22281,  1012,  2012,  2028,  2203,  1997,  2009,
          1037, 11401, 13082,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
encoded.word_ids-idx [False False False  True False False False False False False False False
 False False False False False False False False False False False False
 Fals

output torch.Size([44, 768])
9 prevent tens shape torch.Size([10, 768])
tensor(-2.5119)
counter 49
10 a
encoded {'input_ids': tensor([[  101,  1997,  3377, 26842,  1010,  2295,  2025,  2855,  2438,  2000,
          4652,  1037, 28693,  1997, 24842,  3723,  6497,  2013,  5738,  2247,
          2007,  2032,  1012,  1996,  6797, 15488, 20042,  1997, 17020, 28540,
          1998,  2214, 17768, 22281,  1012,  2012,  2028,  2203,  1997,  2009,
          1037, 11401, 13082,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
encoded.word_ids-idx [False False False False False False False False False False False  True
 False False False False False False False False False False False False
 False Fa

output torch.Size([44, 768])
17 along tens shape torch.Size([18, 768])
tensor(-0.9473)
counter 57
18 with
encoded {'input_ids': tensor([[  101,  1997,  3377, 26842,  1010,  2295,  2025,  2855,  2438,  2000,
          4652,  1037, 28693,  1997, 24842,  3723,  6497,  2013,  5738,  2247,
          2007,  2032,  1012,  1996,  6797, 15488, 20042,  1997, 17020, 28540,
          1998,  2214, 17768, 22281,  1012,  2012,  2028,  2203,  1997,  2009,
          1037, 11401, 13082,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
encoded.word_ids-idx [False False False False False False False False False False False False
 False False False False False False False False  True False False False
 False 

output torch.Size([44, 768])
25 boiled tens shape torch.Size([26, 768])
tensor(4.1905)
counter 65
26 cabbage
encoded {'input_ids': tensor([[  101,  1997,  3377, 26842,  1010,  2295,  2025,  2855,  2438,  2000,
          4652,  1037, 28693,  1997, 24842,  3723,  6497,  2013,  5738,  2247,
          2007,  2032,  1012,  1996,  6797, 15488, 20042,  1997, 17020, 28540,
          1998,  2214, 17768, 22281,  1012,  2012,  2028,  2203,  1997,  2009,
          1037, 11401, 13082,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
encoded.word_ids-idx [False False False False False False False False False False False False
 False False False False False False False False False False False False
 Fal

output torch.Size([44, 768])
33 one tens shape torch.Size([34, 768])
tensor(-3.1514)
counter 73
34 end
encoded {'input_ids': tensor([[  101,  1997,  3377, 26842,  1010,  2295,  2025,  2855,  2438,  2000,
          4652,  1037, 28693,  1997, 24842,  3723,  6497,  2013,  5738,  2247,
          2007,  2032,  1012,  1996,  6797, 15488, 20042,  1997, 17020, 28540,
          1998,  2214, 17768, 22281,  1012,  2012,  2028,  2203,  1997,  2009,
          1037, 11401, 13082,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
encoded.word_ids-idx [False False False False False False False False False False False False
 False False False False False False False False False False False False
 False Fal

output torch.Size([45, 768])
2 too tens shape torch.Size([42, 768])
tensor(0.5539)
counter 81
3 large
encoded {'input_ids': tensor([[  101,  1010,  2205,  2312,  2005,  7169,  4653,  1010,  2018,  2042,
         26997,  2098,  2000,  1996,  2813,  1012,  2009,  8212,  3432,  2019,
          8216,  2227,  1010,  2062,  2084,  1037,  7924,  2898,  1024,  1996,
          2227,  1997,  1037,  2158,  1997,  2055,  5659,  1011,  2274,  1010,
          2007,  1037,  3082,  2304,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
encoded.word_ids-idx [False False False False  True False False False False False False False
 False False False False False False False False False False False Fals

output torch.Size([45, 768])
10 tacked tens shape torch.Size([50, 768])
tensor(-1.9800)
counter 89
11 to
encoded {'input_ids': tensor([[  101,  1010,  2205,  2312,  2005,  7169,  4653,  1010,  2018,  2042,
         26997,  2098,  2000,  1996,  2813,  1012,  2009,  8212,  3432,  2019,
          8216,  2227,  1010,  2062,  2084,  1037,  7924,  2898,  1024,  1996,
          2227,  1997,  1037,  2158,  1997,  2055,  5659,  1011,  2274,  1010,
          2007,  1037,  3082,  2304,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
encoded.word_ids-idx [False False False False False False False False False False False False
 False  True False False False False False False False False False F

output torch.Size([45, 768])
18 an tens shape torch.Size([58, 768])
tensor(0.6239)
counter 97
19 enormous
encoded {'input_ids': tensor([[  101,  1010,  2205,  2312,  2005,  7169,  4653,  1010,  2018,  2042,
         26997,  2098,  2000,  1996,  2813,  1012,  2009,  8212,  3432,  2019,
          8216,  2227,  1010,  2062,  2084,  1037,  7924,  2898,  1024,  1996,
          2227,  1997,  1037,  2158,  1997,  2055,  5659,  1011,  2274,  1010,
          2007,  1037,  3082,  2304,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
encoded.word_ids-idx [False False False False False False False False False False False False
 False False False False False False False False False  True False 

output torch.Size([45, 768])
26 wide tens shape torch.Size([66, 768])
tensor(0.0591)
counter 105
27 :
encoded {'input_ids': tensor([[  101,  1010,  2205,  2312,  2005,  7169,  4653,  1010,  2018,  2042,
         26997,  2098,  2000,  1996,  2813,  1012,  2009,  8212,  3432,  2019,
          8216,  2227,  1010,  2062,  2084,  1037,  7924,  2898,  1024,  1996,
          2227,  1997,  1037,  2158,  1997,  2055,  5659,  1011,  2274,  1010,
          2007,  1037,  3082,  2304,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
encoded.word_ids-idx [False False False False False False False False False False False False
 False False False False False False False False False False False Fals

output torch.Size([45, 768])
34 about tens shape torch.Size([74, 768])
tensor(2.5633)
counter 113
35 forty-five
encoded {'input_ids': tensor([[  101,  1010,  2205,  2312,  2005,  7169,  4653,  1010,  2018,  2042,
         26997,  2098,  2000,  1996,  2813,  1012,  2009,  8212,  3432,  2019,
          8216,  2227,  1010,  2062,  2084,  1037,  7924,  2898,  1024,  1996,
          2227,  1997,  1037,  2158,  1997,  2055,  5659,  1011,  2274,  1010,
          2007,  1037,  3082,  2304,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
encoded.word_ids-idx [False False False False False False False False False False False False
 False False False False False False False False False False 

output torch.Size([45, 768])
1 and tens shape torch.Size([2, 768])
tensor(-3.8513)
counter 121
2 ruggedly
encoded {'input_ids': tensor([[  101,  9587, 19966, 15395,  1998, 17638,  2135,  8502,  2838,  1012,
         10180,  2081,  2005,  1996,  5108,  1012,  2009,  2001,  2053,  2224,
          2667,  1996,  6336,  1012,  2130,  2012,  1996,  2190,  1997,  2335,
          2009,  2001, 15839,  2551,  1010,  1998,  2012,  2556,  1996,  3751,
          2783,  2001,  3013,  2125,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
encoded.word_ids-idx [False False False False False  True  True False False False False False
 False False False False False False False False False False False 

output torch.Size([45, 768])
9 the tens shape torch.Size([10, 768])
tensor(-2.4984)
counter 129
10 stairs
encoded {'input_ids': tensor([[  101,  9587, 19966, 15395,  1998, 17638,  2135,  8502,  2838,  1012,
         10180,  2081,  2005,  1996,  5108,  1012,  2009,  2001,  2053,  2224,
          2667,  1996,  6336,  1012,  2130,  2012,  1996,  2190,  1997,  2335,
          2009,  2001, 15839,  2551,  1010,  1998,  2012,  2556,  1996,  3751,
          2783,  2001,  3013,  2125,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
encoded.word_ids-idx [False False False False False False False False False False False False
 False False  True False False False False False False False False 

output torch.Size([45, 768])
17 the tens shape torch.Size([18, 768])
tensor(-2.5956)
counter 137
18 lift
encoded {'input_ids': tensor([[  101,  9587, 19966, 15395,  1998, 17638,  2135,  8502,  2838,  1012,
         10180,  2081,  2005,  1996,  5108,  1012,  2009,  2001,  2053,  2224,
          2667,  1996,  6336,  1012,  2130,  2012,  1996,  2190,  1997,  2335,
          2009,  2001, 15839,  2551,  1010,  1998,  2012,  2556,  1996,  3751,
          2783,  2001,  3013,  2125,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
encoded.word_ids-idx [False False False False False False False False False False False False
 False False False False False False False False False False  True F

output torch.Size([45, 768])
26 it tens shape torch.Size([27, 768])
tensor(2.7012)
counter 146
27 was
encoded {'input_ids': tensor([[  101,  9587, 19966, 15395,  1998, 17638,  2135,  8502,  2838,  1012,
         10180,  2081,  2005,  1996,  5108,  1012,  2009,  2001,  2053,  2224,
          2667,  1996,  6336,  1012,  2130,  2012,  1996,  2190,  1997,  2335,
          2009,  2001, 15839,  2551,  1010,  1998,  2012,  2556,  1996,  3751,
          2783,  2001,  3013,  2125,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
encoded.word_ids-idx [False False False False False False False False False False False False
 False False False False False False False False False False False Fals

output torch.Size([45, 768])
34 the tens shape torch.Size([35, 768])
tensor(-4.9372)
counter 154
35 electric
encoded {'input_ids': tensor([[  101,  9587, 19966, 15395,  1998, 17638,  2135,  8502,  2838,  1012,
         10180,  2081,  2005,  1996,  5108,  1012,  2009,  2001,  2053,  2224,
          2667,  1996,  6336,  1012,  2130,  2012,  1996,  2190,  1997,  2335,
          2009,  2001, 15839,  2551,  1010,  1998,  2012,  2556,  1996,  3751,
          2783,  2001,  3013,  2125,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
encoded.word_ids-idx [False False False False False False False False False False False False
 False False False False False False False False False False Fal

output torch.Size([47, 768])
1 daylight tens shape torch.Size([2, 768])
tensor(0.2879)
counter 161
2 hours
encoded {'input_ids': tensor([[  101,  2076, 11695,  2847,  1012,  2009,  2001,  2112,  1997,  1996,
          4610,  3298,  1999,  7547,  2005,  5223,  2733,  1012,  1996,  4257,
          2001,  2698,  7599,  2039,  1010,  1998, 10180,  1010,  2040,  2001,
          4228,  1011,  3157,  1998,  2018,  1037, 13075, 11261,  3366, 17359,
         17119,  2682,  2010,  2157, 10792,  1010,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
encoded.word_ids-idx [False False False  True False False False False False False False False
 False False False False False False Fal

output torch.Size([47, 768])
9 economy tens shape torch.Size([10, 768])
tensor(-0.0948)
counter 169
10 drive
encoded {'input_ids': tensor([[  101,  2076, 11695,  2847,  1012,  2009,  2001,  2112,  1997,  1996,
          4610,  3298,  1999,  7547,  2005,  5223,  2733,  1012,  1996,  4257,
          2001,  2698,  7599,  2039,  1010,  1998, 10180,  1010,  2040,  2001,
          4228,  1011,  3157,  1998,  2018,  1037, 13075, 11261,  3366, 17359,
         17119,  2682,  2010,  2157, 10792,  1010,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
encoded.word_ids-idx [False False False False False False False False False False False  True
 False False False False False False F

output torch.Size([47, 768])
16 . tens shape torch.Size([17, 768])
tensor(0.2090)
counter 176
17 The
encoded {'input_ids': tensor([[  101,  2076, 11695,  2847,  1012,  2009,  2001,  2112,  1997,  1996,
          4610,  3298,  1999,  7547,  2005,  5223,  2733,  1012,  1996,  4257,
          2001,  2698,  7599,  2039,  1010,  1998, 10180,  1010,  2040,  2001,
          4228,  1011,  3157,  1998,  2018,  1037, 13075, 11261,  3366, 17359,
         17119,  2682,  2010,  2157, 10792,  1010,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
encoded.word_ids-idx [False False False False False False False False False False False False
 False False False False False False  True Fal

output torch.Size([47, 768])
23 , tens shape torch.Size([24, 768])
tensor(0.6847)
counter 183
24 and
encoded {'input_ids': tensor([[  101,  2076, 11695,  2847,  1012,  2009,  2001,  2112,  1997,  1996,
          4610,  3298,  1999,  7547,  2005,  5223,  2733,  1012,  1996,  4257,
          2001,  2698,  7599,  2039,  1010,  1998, 10180,  1010,  2040,  2001,
          4228,  1011,  3157,  1998,  2018,  1037, 13075, 11261,  3366, 17359,
         17119,  2682,  2010,  2157, 10792,  1010,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
encoded.word_ids-idx [False False False False False False False False False False False False
 False False False False False False False Fal

output torch.Size([47, 768])
30 and tens shape torch.Size([31, 768])
tensor(0.6834)
counter 190
31 had
encoded {'input_ids': tensor([[  101,  2076, 11695,  2847,  1012,  2009,  2001,  2112,  1997,  1996,
          4610,  3298,  1999,  7547,  2005,  5223,  2733,  1012,  1996,  4257,
          2001,  2698,  7599,  2039,  1010,  1998, 10180,  1010,  2040,  2001,
          4228,  1011,  3157,  1998,  2018,  1037, 13075, 11261,  3366, 17359,
         17119,  2682,  2010,  2157, 10792,  1010,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
encoded.word_ids-idx [False False False False False False False False False False False False
 False False False False False False False F

output torch.Size([47, 768])
37 right tens shape torch.Size([38, 768])
tensor(-0.6562)
counter 197
38 ankle
encoded {'input_ids': tensor([[  101,  2076, 11695,  2847,  1012,  2009,  2001,  2112,  1997,  1996,
          4610,  3298,  1999,  7547,  2005,  5223,  2733,  1012,  1996,  4257,
          2001,  2698,  7599,  2039,  1010,  1998, 10180,  1010,  2040,  2001,
          4228,  1011,  3157,  1998,  2018,  1037, 13075, 11261,  3366, 17359,
         17119,  2682,  2010,  2157, 10792,  1010,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
encoded.word_ids-idx [False False False False False False False False False False False False
 False False False False False False Fa

output torch.Size([46, 768])
1 slowly tens shape torch.Size([2, 768])
tensor(-0.7769)
counter 201
2 ,
encoded {'input_ids': tensor([[  101,  2253,  3254,  1010,  8345,  2195,  2335,  2006,  1996,  2126,
          1012,  2006,  2169,  4899,  1010,  4500,  1996,  6336,  1011,  9093,
          1010,  1996, 13082,  2007,  1996,  8216,  2227, 11114,  2013,  1996,
          2813,  1012,  2009,  2001,  2028,  1997,  2216,  4620,  2029,  2024,
          2061,  9530, 18886,  7178,  2008,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
encoded.word_ids-idx [False False False  True False False False False False False False False
 False False False False False False False False False Fal

output torch.Size([46, 768])
9 . tens shape torch.Size([10, 768])
tensor(0.3038)
counter 209
10 On
encoded {'input_ids': tensor([[  101,  2253,  3254,  1010,  8345,  2195,  2335,  2006,  1996,  2126,
          1012,  2006,  2169,  4899,  1010,  4500,  1996,  6336,  1011,  9093,
          1010,  1996, 13082,  2007,  1996,  8216,  2227, 11114,  2013,  1996,
          2813,  1012,  2009,  2001,  2028,  1997,  2216,  4620,  2029,  2024,
          2061,  9530, 18886,  7178,  2008,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
encoded.word_ids-idx [False False False False False False False False False False False  True
 False False False False False False False False False False 

output torch.Size([46, 768])
17 , tens shape torch.Size([18, 768])
tensor(0.4407)
counter 217
18 the
encoded {'input_ids': tensor([[  101,  2253,  3254,  1010,  8345,  2195,  2335,  2006,  1996,  2126,
          1012,  2006,  2169,  4899,  1010,  4500,  1996,  6336,  1011,  9093,
          1010,  1996, 13082,  2007,  1996,  8216,  2227, 11114,  2013,  1996,
          2813,  1012,  2009,  2001,  2028,  1997,  2216,  4620,  2029,  2024,
          2061,  9530, 18886,  7178,  2008,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
encoded.word_ids-idx [False False False False False False False False False False False False
 False False False False False False False  True False Fals

output torch.Size([46, 768])
25 from tens shape torch.Size([26, 768])
tensor(4.1014)
counter 225
26 the
encoded {'input_ids': tensor([[  101,  2253,  3254,  1010,  8345,  2195,  2335,  2006,  1996,  2126,
          1012,  2006,  2169,  4899,  1010,  4500,  1996,  6336,  1011,  9093,
          1010,  1996, 13082,  2007,  1996,  8216,  2227, 11114,  2013,  1996,
          2813,  1012,  2009,  2001,  2028,  1997,  2216,  4620,  2029,  2024,
          2061,  9530, 18886,  7178,  2008,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
encoded.word_ids-idx [False False False False False False False False False False False False
 False False False False False False False False False F

output torch.Size([46, 768])
33 those tens shape torch.Size([34, 768])
tensor(1.3464)
counter 233
34 pictures
encoded {'input_ids': tensor([[  101,  2253,  3254,  1010,  8345,  2195,  2335,  2006,  1996,  2126,
          1012,  2006,  2169,  4899,  1010,  4500,  1996,  6336,  1011,  9093,
          1010,  1996, 13082,  2007,  1996,  8216,  2227, 11114,  2013,  1996,
          2813,  1012,  2009,  2001,  2028,  1997,  2216,  4620,  2029,  2024,
          2061,  9530, 18886,  7178,  2008,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
encoded.word_ids-idx [False False False False False False False False False False False False
 False False False False False False False False F

output torch.Size([44, 768])
1 eyes tens shape torch.Size([2, 768])
tensor(2.3030)
counter 241
2 follow
encoded {'input_ids': tensor([[  101,  1996,  2159,  3582,  2017,  2055,  2043,  2017,  2693,  1012,
          2502,  2567,  2003,  3666,  2017,  1010,  1996, 14408,  3258,  4218,
          2009,  2743,  1012,  2503,  1996,  4257,  1037,  5909,  2100,  2376,
          2001,  3752,  2041,  1037,  2862,  1997,  4481,  2029,  2018,  2242,
          2000,  2079,  2007,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
encoded.word_ids-idx [False False False  True False False False False False False False False
 False False False False False False False False False False False False
 False Fa

output torch.Size([44, 768])
9 Big tens shape torch.Size([10, 768])
tensor(1.7216)
counter 249
10 Brother
encoded {'input_ids': tensor([[  101,  1996,  2159,  3582,  2017,  2055,  2043,  2017,  2693,  1012,
          2502,  2567,  2003,  3666,  2017,  1010,  1996, 14408,  3258,  4218,
          2009,  2743,  1012,  2503,  1996,  4257,  1037,  5909,  2100,  2376,
          2001,  3752,  2041,  1037,  2862,  1997,  4481,  2029,  2018,  2242,
          2000,  2079,  2007,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
encoded.word_ids-idx [False False False False False False False False False False False  True
 False False False False False False False False False False False False
 False 

output torch.Size([44, 768])
17 beneath tens shape torch.Size([18, 768])
tensor(-1.7936)
counter 257
18 it
encoded {'input_ids': tensor([[  101,  1996,  2159,  3582,  2017,  2055,  2043,  2017,  2693,  1012,
          2502,  2567,  2003,  3666,  2017,  1010,  1996, 14408,  3258,  4218,
          2009,  2743,  1012,  2503,  1996,  4257,  1037,  5909,  2100,  2376,
          2001,  3752,  2041,  1037,  2862,  1997,  4481,  2029,  2018,  2242,
          2000,  2079,  2007,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
encoded.word_ids-idx [False False False False False False False False False False False False
 False False False False False False False False  True False False False
 False

output torch.Size([44, 768])
25 fruity tens shape torch.Size([26, 768])
tensor(-1.6655)
counter 265
26 voice
encoded {'input_ids': tensor([[  101,  1996,  2159,  3582,  2017,  2055,  2043,  2017,  2693,  1012,
          2502,  2567,  2003,  3666,  2017,  1010,  1996, 14408,  3258,  4218,
          2009,  2743,  1012,  2503,  1996,  4257,  1037,  5909,  2100,  2376,
          2001,  3752,  2041,  1037,  2862,  1997,  4481,  2029,  2018,  2242,
          2000,  2079,  2007,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
encoded.word_ids-idx [False False False False False False False False False False False False
 False False False False False False False False False False False False
 Fal

output torch.Size([44, 768])
33 figures tens shape torch.Size([34, 768])
tensor(4.4888)
counter 273
34 which
encoded {'input_ids': tensor([[  101,  1996,  2159,  3582,  2017,  2055,  2043,  2017,  2693,  1012,
          2502,  2567,  2003,  3666,  2017,  1010,  1996, 14408,  3258,  4218,
          2009,  2743,  1012,  2503,  1996,  4257,  1037,  5909,  2100,  2376,
          2001,  3752,  2041,  1037,  2862,  1997,  4481,  2029,  2018,  2242,
          2000,  2079,  2007,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
encoded.word_ids-idx [False False False False False False False False False False False False
 False False False False False False False False False False False False
 Fal

output torch.Size([48, 768])
1 production tens shape torch.Size([2, 768])
tensor(2.7239)
counter 281
2 of
encoded {'input_ids': tensor([[  101,  1996,  2537,  1997, 10369,  1011,  3707,  1012,  1996,  2376,
          2234,  2013,  2019, 27885, 10052,  3384, 11952,  2066,  1037, 10634,
          2098,  5259,  2029,  2719,  2112,  1997,  1996,  3302,  1997,  1996,
          2157,  1011,  2192,  2813,  1012, 10180,  2357,  1037,  6942,  1998,
          1996,  2376,  7569,  5399,  1010,  2295,  1996,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
encoded.word_ids-idx [False False False  True False False False False False False False False
 False False False False Fal

output torch.Size([48, 768])
8 from tens shape torch.Size([9, 768])
tensor(1.4111)
counter 288
9 an
encoded {'input_ids': tensor([[  101,  1996,  2537,  1997, 10369,  1011,  3707,  1012,  1996,  2376,
          2234,  2013,  2019, 27885, 10052,  3384, 11952,  2066,  1037, 10634,
          2098,  5259,  2029,  2719,  2112,  1997,  1996,  3302,  1997,  1996,
          2157,  1011,  2192,  2813,  1012, 10180,  2357,  1037,  6942,  1998,
          1996,  2376,  7569,  5399,  1010,  2295,  1996,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
encoded.word_ids-idx [False False False False False False False False False False  True False
 False False False False False Fal

output torch.Size([48, 768])
15 dulled tens shape torch.Size([16, 768])
tensor(0.2061)
counter 295
16 mirror
encoded {'input_ids': tensor([[  101,  1996,  2537,  1997, 10369,  1011,  3707,  1012,  1996,  2376,
          2234,  2013,  2019, 27885, 10052,  3384, 11952,  2066,  1037, 10634,
          2098,  5259,  2029,  2719,  2112,  1997,  1996,  3302,  1997,  1996,
          2157,  1011,  2192,  2813,  1012, 10180,  2357,  1037,  6942,  1998,
          1996,  2376,  7569,  5399,  1010,  2295,  1996,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
encoded.word_ids-idx [False False False False False False False False False False False False
 False False False False 

output torch.Size([48, 768])
22 surface tens shape torch.Size([23, 768])
tensor(-4.4347)
counter 302
23 of
encoded {'input_ids': tensor([[  101,  1996,  2537,  1997, 10369,  1011,  3707,  1012,  1996,  2376,
          2234,  2013,  2019, 27885, 10052,  3384, 11952,  2066,  1037, 10634,
          2098,  5259,  2029,  2719,  2112,  1997,  1996,  3302,  1997,  1996,
          2157,  1011,  2192,  2813,  1012, 10180,  2357,  1037,  6942,  1998,
          1996,  2376,  7569,  5399,  1010,  2295,  1996,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
encoded.word_ids-idx [False False False False False False False False False False False False
 False False False False Fa

output torch.Size([48, 768])
29 turned tens shape torch.Size([30, 768])
tensor(-2.1444)
counter 309
30 a
encoded {'input_ids': tensor([[  101,  1996,  2537,  1997, 10369,  1011,  3707,  1012,  1996,  2376,
          2234,  2013,  2019, 27885, 10052,  3384, 11952,  2066,  1037, 10634,
          2098,  5259,  2029,  2719,  2112,  1997,  1996,  3302,  1997,  1996,
          2157,  1011,  2192,  2813,  1012, 10180,  2357,  1037,  6942,  1998,
          1996,  2376,  7569,  5399,  1010,  2295,  1996,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
encoded.word_ids-idx [False False False False False False False False False False False False
 False False False False Fals

output torch.Size([48, 768])
36 somewhat tens shape torch.Size([37, 768])
tensor(-2.4430)
counter 316
37 ,
encoded {'input_ids': tensor([[  101,  1996,  2537,  1997, 10369,  1011,  3707,  1012,  1996,  2376,
          2234,  2013,  2019, 27885, 10052,  3384, 11952,  2066,  1037, 10634,
          2098,  5259,  2029,  2719,  2112,  1997,  1996,  3302,  1997,  1996,
          2157,  1011,  2192,  2813,  1012, 10180,  2357,  1037,  6942,  1998,
          1996,  2376,  7569,  5399,  1010,  2295,  1996,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
encoded.word_ids-idx [False False False False False False False False False False False False
 False False False False Fa

output torch.Size([47, 768])
1 were tens shape torch.Size([2, 768])
tensor(2.4978)
counter 321
2 still
encoded {'input_ids': tensor([[  101,  2616,  2020,  2145, 10782,  3085,  1012,  1996,  6602,  1006,
          1996, 10093,  2229, 24410,  1010,  2009,  2001,  2170,  1007,  2071,
          2022, 11737,  7583,  1010,  2021,  2045,  2001,  2053,  2126,  1997,
         17521,  2009,  2125,  3294,  1012,  2002,  2333,  2058,  2000,  1996,
          3332,  1024,  1037,  2235,  4509,  1010,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
encoded.word_ids-idx [False False False  True False False False False False False False False
 False False False False False False False F

output torch.Size([47, 768])
8 the tens shape torch.Size([9, 768])
tensor(-2.8035)
counter 328
9 telescreen
encoded {'input_ids': tensor([[  101,  2616,  2020,  2145, 10782,  3085,  1012,  1996,  6602,  1006,
          1996, 10093,  2229, 24410,  1010,  2009,  2001,  2170,  1007,  2071,
          2022, 11737,  7583,  1010,  2021,  2045,  2001,  2053,  2126,  1997,
         17521,  2009,  2125,  3294,  1012,  2002,  2333,  2058,  2000,  1996,
          3332,  1024,  1037,  2235,  4509,  1010,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
encoded.word_ids-idx [False False False False False False False False False False False  True
  True  True False False False False Fa

output torch.Size([47, 768])
15 could tens shape torch.Size([16, 768])
tensor(3.2988)
counter 335
16 be
encoded {'input_ids': tensor([[  101,  2616,  2020,  2145, 10782,  3085,  1012,  1996,  6602,  1006,
          1996, 10093,  2229, 24410,  1010,  2009,  2001,  2170,  1007,  2071,
          2022, 11737,  7583,  1010,  2021,  2045,  2001,  2053,  2126,  1997,
         17521,  2009,  2125,  3294,  1012,  2002,  2333,  2058,  2000,  1996,
          3332,  1024,  1037,  2235,  4509,  1010,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
encoded.word_ids-idx [False False False False False False False False False False False False
 False False False False False False False 

output torch.Size([47, 768])
22 no tens shape torch.Size([23, 768])
tensor(-4.4405)
counter 342
23 way
encoded {'input_ids': tensor([[  101,  2616,  2020,  2145, 10782,  3085,  1012,  1996,  6602,  1006,
          1996, 10093,  2229, 24410,  1010,  2009,  2001,  2170,  1007,  2071,
          2022, 11737,  7583,  1010,  2021,  2045,  2001,  2053,  2126,  1997,
         17521,  2009,  2125,  3294,  1012,  2002,  2333,  2058,  2000,  1996,
          3332,  1024,  1037,  2235,  4509,  1010,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
encoded.word_ids-idx [False False False False False False False False False False False False
 False False False False False False False F

output torch.Size([47, 768])
29 . tens shape torch.Size([30, 768])
tensor(0.2531)
counter 349
30 He
encoded {'input_ids': tensor([[  101,  2616,  2020,  2145, 10782,  3085,  1012,  1996,  6602,  1006,
          1996, 10093,  2229, 24410,  1010,  2009,  2001,  2170,  1007,  2071,
          2022, 11737,  7583,  1010,  2021,  2045,  2001,  2053,  2126,  1997,
         17521,  2009,  2125,  3294,  1012,  2002,  2333,  2058,  2000,  1996,
          3332,  1024,  1037,  2235,  4509,  1010,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
encoded.word_ids-idx [False False False False False False False False False False False False
 False False False False False False False Fals

output torch.Size([47, 768])
36 : tens shape torch.Size([37, 768])
tensor(0.3582)
counter 356
37 a
encoded {'input_ids': tensor([[  101,  2616,  2020,  2145, 10782,  3085,  1012,  1996,  6602,  1006,
          1996, 10093,  2229, 24410,  1010,  2009,  2001,  2170,  1007,  2071,
          2022, 11737,  7583,  1010,  2021,  2045,  2001,  2053,  2126,  1997,
         17521,  2009,  2125,  3294,  1012,  2002,  2333,  2058,  2000,  1996,
          3332,  1024,  1037,  2235,  4509,  1010,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
encoded.word_ids-idx [False False False False False False False False False False False False
 False False False False False False False False

output torch.Size([49, 768])
1 figure tens shape torch.Size([2, 768])
tensor(5.1025)
counter 361
2 ,
encoded {'input_ids': tensor([[  101, 25737,  3275,  1010,  1996,  2033,  8490,  7389,  7971,  1997,
          2010,  2303,  6414, 13155,  2011,  1996,  2630,  3452,  2015,  2029,
          2020,  1996,  6375,  1997,  1996,  2283,  1012,  2010,  2606,  2001,
          2200,  4189,  1010,  2010,  2227,  8100,  6369, 20023,  2063,  1010,
          2010,  3096,  5931,  6675,  2011, 20392,  7815,  1998,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1]])}
encoded.word_ids-idx [False False False  True False False False False False False False False
 F

output torch.Size([49, 768])
8 merely tens shape torch.Size([9, 768])
tensor(0.6038)
counter 368
9 emphasized
encoded {'input_ids': tensor([[  101, 25737,  3275,  1010,  1996,  2033,  8490,  7389,  7971,  1997,
          2010,  2303,  6414, 13155,  2011,  1996,  2630,  3452,  2015,  2029,
          2020,  1996,  6375,  1997,  1996,  2283,  1012,  2010,  2606,  2001,
          2200,  4189,  1010,  2010,  2227,  8100,  6369, 20023,  2063,  1010,
          2010,  3096,  5931,  6675,  2011, 20392,  7815,  1998,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1]])}
encoded.word_ids-idx [False False False False False False False False False False False

output torch.Size([49, 768])
15 were tens shape torch.Size([16, 768])
tensor(2.5366)
counter 375
16 the
encoded {'input_ids': tensor([[  101, 25737,  3275,  1010,  1996,  2033,  8490,  7389,  7971,  1997,
          2010,  2303,  6414, 13155,  2011,  1996,  2630,  3452,  2015,  2029,
          2020,  1996,  6375,  1997,  1996,  2283,  1012,  2010,  2606,  2001,
          2200,  4189,  1010,  2010,  2227,  8100,  6369, 20023,  2063,  1010,
          2010,  3096,  5931,  6675,  2011, 20392,  7815,  1998,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1]])}
encoded.word_ids-idx [False False False False False False False False False False False False

output torch.Size([49, 768])
22 His tens shape torch.Size([23, 768])
tensor(3.8306)
counter 382
23 hair
encoded {'input_ids': tensor([[  101, 25737,  3275,  1010,  1996,  2033,  8490,  7389,  7971,  1997,
          2010,  2303,  6414, 13155,  2011,  1996,  2630,  3452,  2015,  2029,
          2020,  1996,  6375,  1997,  1996,  2283,  1012,  2010,  2606,  2001,
          2200,  4189,  1010,  2010,  2227,  8100,  6369, 20023,  2063,  1010,
          2010,  3096,  5931,  6675,  2011, 20392,  7815,  1998,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1]])}
encoded.word_ids-idx [False False False False False False False False False False False False

output torch.Size([49, 768])
29 face tens shape torch.Size([30, 768])
tensor(4.8590)
counter 389
30 naturally
encoded {'input_ids': tensor([[  101, 25737,  3275,  1010,  1996,  2033,  8490,  7389,  7971,  1997,
          2010,  2303,  6414, 13155,  2011,  1996,  2630,  3452,  2015,  2029,
          2020,  1996,  6375,  1997,  1996,  2283,  1012,  2010,  2606,  2001,
          2200,  4189,  1010,  2010,  2227,  8100,  6369, 20023,  2063,  1010,
          2010,  3096,  5931,  6675,  2011, 20392,  7815,  1998,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1]])}
encoded.word_ids-idx [False False False False False False False False False False False

output torch.Size([49, 768])
36 by tens shape torch.Size([37, 768])
tensor(-2.0391)
counter 396
37 coarse
encoded {'input_ids': tensor([[  101, 25737,  3275,  1010,  1996,  2033,  8490,  7389,  7971,  1997,
          2010,  2303,  6414, 13155,  2011,  1996,  2630,  3452,  2015,  2029,
          2020,  1996,  6375,  1997,  1996,  2283,  1012,  2010,  2606,  2001,
          2200,  4189,  1010,  2010,  2227,  8100,  6369, 20023,  2063,  1010,
          2010,  3096,  5931,  6675,  2011, 20392,  7815,  1998,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1]])}
encoded.word_ids-idx [False False False False False False False False False False False Fal

This part deals with POS tags from NLTK corpus (data variable instantiated above)

### Label creators


The below creates the labels corresponding to contextual embeddings for tokens tensor above. Some parts are manual, as some parts are split differently above, as they are in the NLTK corpus. For example, '3,000' versus '3'  ','  '000'  splits differently for both. Same for '...' versus '.' ,'.', '.' .

In [1565]:
## ----------- II ----------- #
#test label creator
bubu2=[]
dev_labels = []
tag_counter = 0
counter_label = 0
for idx3, k in enumerate(data):
    #print(idx3, k)
    for idx4, m in enumerate(k):
        #print(tag_counter,m)
        if (40*0)-1 < tag_counter < 40*401:
            if tag_counter in(5920,7200):
                print(tag_counter,m)
                bubu2.append("'s")
                dev_labels.append('VERB')
            elif tag_counter == 10760:
                bubu2.append("'s")
                dev_labels.append('ADP')
                print("s',ADP") 
            elif tag_counter == 8360:
                bubu2.append("'m")
                dev_labels.append('VERB')
                print("m,VERB")
            elif tag_counter == 11286:
                bubu2.append("3")
                dev_labels.append('NUM')
                print("3,NUM")
                bubu2.append(",")
                dev_labels.append('')
                bubu2.append("000")
                dev_labels.append('NUM')
            elif tag_counter == 13297:
                bubu2.append("W")
                dev_labels.append('NOUN')
                bubu2.append(".")
                dev_labels.append('')
            elif m[0] !=  "'s" and  m[0] != "'ve" and  m[0] != "'t" and m[0] !=  "'d" and m[0] !=  "'m" and m[0] !=  "'re" and m[0] !=  "'ave"and m[0] !=  "'ll" and m[0] !=  "'em" :
                if m[0] == '...':
                    bubu2.append('.')
                    dev_labels.append('')
                    bubu2.append('.')
                    dev_labels.append('')
                    print('')  
                elif tag_counter in (13888,13895,13907,13913,16062):
                    bubu2.append('17')
                    dev_labels.append('NUM')
                    bubu2.append('.')
                    dev_labels.append('')
                    bubu2.append('3')
                    dev_labels.append('NUM')
                    bubu2.append('.')
                    dev_labels.append('')
                bubu2.append(m[0])
                dev_labels.append(m[1])
                #print(ww[tag_counter])
            print(tag_counter,m)
        tag_counter += 1
        counter_label +=1

0 ('It', 'PRON')
1 ('was', 'VERB')
2 ('a', 'DET')
3 ('bright', 'ADJ')
4 ('cold', 'ADJ')
5 ('day', 'NOUN')
6 ('in', 'ADP')
7 ('April', 'NOUN')
8 (',', '')
9 ('and', 'CONJ')
10 ('the', 'DET')
11 ('clocks', 'NOUN')
12 ('were', 'VERB')
13 ('striking', 'VERB')
14 ('thirteen', 'NUM')
15 ('.', '')
16 ('Winston', 'NOUN')
17 ('Smith', 'NOUN')
18 (',', '')
19 ('his', 'DET')
20 ('chin', 'NOUN')
21 ('nuzzled', 'VERB')
22 ('into', 'ADP')
23 ('his', 'DET')
24 ('breast', 'NOUN')
25 ('in', 'ADP')
26 ('an', 'DET')
27 ('effort', 'NOUN')
28 ('to', 'ADP')
29 ('escape', 'VERB')
30 ('the', 'DET')
31 ('vile', 'ADJ')
32 ('wind', 'NOUN')
33 (',', '')
34 ('slipped', 'VERB')
35 ('quickly', 'ADV')
36 ('through', 'ADP')
37 ('the', 'DET')
38 ('glass', 'NOUN')
39 ('doors', 'NOUN')
40 ('of', 'ADP')
41 ('Victory', 'NOUN')
42 ('Mansions', 'NOUN')
43 (',', '')
44 ('though', 'CONJ')
45 ('not', 'ADV')
46 ('quickly', 'ADV')
47 ('enough', 'ADV')
48 ('to', 'ADP')
49 ('prevent', 'VERB')
50 ('a', 'DET')
51 ('swirl', 'NOUN')
52

893 ('of', 'ADP')
894 ('timber', 'NOUN')
895 (',', '')
896 ('their', 'DET')
897 ('windows', 'NOUN')
898 ('patched', 'VERB')
899 ('with', 'ADP')
900 ('cardboard', 'NOUN')
901 ('and', 'CONJ')
902 ('their', 'DET')
903 ('roofs', 'NOUN')
904 ('with', 'ADP')
905 ('corrugated', 'ADJ')
906 ('iron', 'NOUN')
907 (',', '')
908 ('their', 'DET')
909 ('crazy', 'ADJ')
910 ('garden', 'NOUN')
911 ('walls', 'NOUN')
912 ('sagging', 'VERB')
913 ('in', 'ADP')
914 ('all', 'DET')
915 ('directions', 'NOUN')
916 ('?', '')
917 ('And', 'CONJ')
918 ('the', 'DET')
919 ('bombed', 'ADJ')
920 ('sites', 'NOUN')
921 ('where', 'ADV')
922 ('the', 'DET')
923 ('plaster', 'NOUN')
924 ('dust', 'NOUN')
925 ('swirled', 'VERB')
926 ('in', 'ADP')
927 ('the', 'DET')
928 ('air', 'NOUN')
929 ('and', 'CONJ')
930 ('the', 'DET')
931 ('willow-herb', 'NOUN')
932 ('straggled', 'VERB')
933 ('over', 'ADP')
934 ('the', 'DET')
935 ('heaps', 'NOUN')
936 ('of', 'ADP')
937 ('rubble', 'NOUN')
938 (';', '')
939 ('and', 'CONJ')
940 ('the', 'DET')


1796 ('Its', 'DET')
1797 ('smooth', 'ADJ')
1798 ('creamy', 'ADJ')
1799 ('paper', 'NOUN')
1800 (',', '')
1801 ('a', 'DET')
1802 ('little', 'ADV')
1803 ('yellowed', 'VERB')
1804 ('by', 'ADP')
1805 ('age', 'NOUN')
1806 (',', '')
1807 ('was', 'VERB')
1808 ('of', 'ADP')
1809 ('a', 'DET')
1810 ('kind', 'NOUN')
1811 ('that', 'PRON')
1812 ('had', 'VERB')
1813 ('not', 'ADV')
1814 ('been', 'VERB')
1815 ('manufactured', 'VERB')
1816 ('for', 'ADP')
1817 ('at', 'ADP')
1818 ('least', 'ADJ')
1819 ('forty', 'NUM')
1820 ('years', 'NOUN')
1821 ('past', 'ADV')
1822 ('.', '')
1823 ('He', 'PRON')
1824 ('could', 'VERB')
1825 ('guess', 'VERB')
1826 (',', '')
1827 ('however', 'ADV')
1828 (',', '')
1829 ('that', 'CONJ')
1830 ('the', 'DET')
1831 ('book', 'NOUN')
1832 ('was', 'VERB')
1833 ('much', 'ADV')
1834 ('older', 'ADJ')
1835 ('than', 'ADP')
1836 ('that', 'PRON')
1837 ('.', '')
1838 ('He', 'PRON')
1839 ('had', 'VERB')
1840 ('seen', 'VERB')
1841 ('it', 'PRON')
1842 ('lying', 'VERB')
1843 ('in', 'ADP')
1844 (

2553 ('The', 'DET')
2554 ('seconds', 'NOUN')
2555 ('were', 'VERB')
2556 ('ticking', 'VERB')
2557 ('by', 'ADV')
2558 ('.', '')
2559 ('He', 'PRON')
2560 ('was', 'VERB')
2561 ('conscious', 'ADJ')
2562 ('of', 'ADP')
2563 ('nothing', 'PRON')
2564 ('except', 'ADP')
2565 ('the', 'DET')
2566 ('blankness', 'NOUN')
2567 ('of', 'ADP')
2568 ('the', 'DET')
2569 ('page', 'NOUN')
2570 ('in', 'ADP')
2571 ('front', 'NOUN')
2572 ('of', 'ADP')
2573 ('him', 'PRON')
2574 (',', '')
2575 ('the', 'DET')
2576 ('itching', 'NOUN')
2577 ('of', 'ADP')
2578 ('the', 'DET')
2579 ('skin', 'NOUN')
2580 ('above', 'ADP')
2581 ('his', 'DET')
2582 ('ankle', 'NOUN')
2583 (',', '')
2584 ('the', 'DET')
2585 ('blaring', 'NOUN')
2586 ('of', 'ADP')
2587 ('the', 'DET')
2588 ('music', 'NOUN')
2589 (',', '')
2590 ('and', 'CONJ')
2591 ('a', 'DET')
2592 ('slight', 'ADJ')
2593 ('booziness', 'NOUN')
2594 ('caused', 'VERB')
2595 ('by', 'ADP')
2596 ('the', 'DET')
2597 ('gin', 'NOUN')
2598 ('.', '')
2599 ('Suddenly', 'ADV')
2600 ('he', 'P

3662 ('was', 'VERB')
3663 ('intrigued', 'VERB')
3664 ('by', 'ADP')
3665 ('the', 'DET')
3666 ('contrast', 'NOUN')
3667 ('between', 'ADP')
3668 ("O'Brien", 'NOUN')
3669 ("'s", 'ADP')
3670 ('urbane', 'ADJ')
3671 ('manner', 'NOUN')
3672 ('and', 'CONJ')
3673 ('his', 'DET')
3674 ('prize-fighter', 'NOUN')
3675 ("'s", 'ADP')
3676 ('physique', 'NOUN')
3677 ('.', '')
3678 ('Much', 'ADV')
3679 ('more', 'ADV')
3680 ('it', 'PRON')
3681 ('was', 'VERB')
3682 ('because', 'CONJ')
3683 ('of', 'ADP')
3684 ('a', 'DET')
3685 ('secretly', 'ADV')
3686 ('held', 'ADJ')
3687 ('belief', 'NOUN')
3688 ('-', '')
3689 ('or', 'CONJ')
3690 ('perhaps', 'ADV')
3691 ('not', 'ADV')
3692 ('even', 'ADV')
3693 ('a', 'DET')
3694 ('belief', 'NOUN')
3695 (',', '')
3696 ('merely', 'ADV')
3697 ('a', 'DET')
3698 ('hope', 'NOUN')
3699 ('-', '')
3700 ('that', 'CONJ')
3701 ("O'Brien", 'NOUN')
3702 ("'s", 'ADP')
3703 ('political', 'ADJ')
3704 ('orthodoxy', 'NOUN')
3705 ('was', 'VERB')
3706 ('not', 'ADV')
3707 ('perfect', 'ADJ')
3708 (

4510 ('The', 'DET')
4511 ('self-satisfied', 'ADJ')
4512 ('sheep-like', 'ADJ')
4513 ('face', 'NOUN')
4514 ('on', 'ADP')
4515 ('the', 'DET')
4516 ('screen', 'NOUN')
4517 (',', '')
4518 ('and', 'CONJ')
4519 ('the', 'DET')
4520 ('terrifying', 'ADJ')
4521 ('power', 'NOUN')
4522 ('of', 'ADP')
4523 ('the', 'DET')
4524 ('Eurasian', 'ADJ')
4525 ('army', 'NOUN')
4526 ('behind', 'ADP')
4527 ('it', 'PRON')
4528 (',', '')
4529 ('were', 'VERB')
4530 ('too', 'ADV')
4531 ('much', 'ADJ')
4532 ('to', 'ADP')
4533 ('be', 'VERB')
4534 ('borne', 'VERB')
4535 (':', '')
4536 ('besides', 'CONJ')
4537 (',', '')
4538 ('the', 'DET')
4539 ('sight', 'NOUN')
4540 ('or', 'CONJ')
4541 ('even', 'ADV')
4542 ('the', 'DET')
4543 ('thought', 'NOUN')
4544 ('of', 'ADP')
4545 ('Goldstein', 'NOUN')
4546 ('produced', 'VERB')
4547 ('fear', 'NOUN')
4548 ('and', 'CONJ')
4549 ('anger', 'NOUN')
4550 ('automatically', 'ADV')
4551 ('.', '')
4552 ('He', 'PRON')
4553 ('was', 'VERB')
4554 ('an', 'DET')
4555 ('object', 'NOUN')
4556 ('of',

5174 ('And', 'CONJ')
5175 ('yet', 'CONJ')
5176 ('the', 'DET')
5177 ('very', 'ADV')
5178 ('next', 'ADJ')
5179 ('instant', 'NOUN')
5180 ('he', 'PRON')
5181 ('was', 'VERB')
5182 ('at', 'ADP')
5183 ('one', 'PRON')
5184 ('with', 'ADP')
5185 ('the', 'DET')
5186 ('people', 'NOUN')
5187 ('about', 'ADP')
5188 ('him', 'PRON')
5189 (',', '')
5190 ('and', 'CONJ')
5191 ('all', 'PRON')
5192 ('that', 'PRON')
5193 ('was', 'VERB')
5194 ('said', 'VERB')
5195 ('of', 'ADP')
5196 ('Goldstein', 'NOUN')
5197 ('seemed', 'VERB')
5198 ('to', 'ADP')
5199 ('him', 'PRON')
5200 ('to', 'ADP')
5201 ('be', 'VERB')
5202 ('true', 'ADJ')
5203 ('.', '')
5204 ('At', 'ADP')
5205 ('those', 'DET')
5206 ('moments', 'NOUN')
5207 ('his', 'DET')
5208 ('secret', 'ADJ')
5209 ('loathing', 'NOUN')
5210 ('of', 'ADP')
5211 ('Big', 'ADJ')
5212 ('Brother', 'NOUN')
5213 ('changed', 'VERB')
5214 ('into', 'ADP')
5215 ('adoration', 'NOUN')
5216 (',', '')
5217 ('and', 'CONJ')
5218 ('Big', 'ADJ')
5219 ('Brother', 'NOUN')
5220 ('seemed', 'VERB'

5996 ('But', 'CONJ')
5997 ('there', 'PRON')
5998 ('was', 'VERB')
5999 ('a', 'DET')
6000 ('space', 'NOUN')
6001 ('of', 'ADP')
6002 ('a', 'DET')
6003 ('couple', 'NOUN')
6004 ('of', 'ADP')
6005 ('seconds', 'NOUN')
6006 ('during', 'ADP')
6007 ('which', 'PRON')
6008 ('the', 'DET')
6009 ('expression', 'NOUN')
6010 ('of', 'ADP')
6011 ('his', 'DET')
6012 ('eyes', 'NOUN')
6013 ('might', 'VERB')
6014 ('conceivably', 'ADV')
6015 ('have', 'VERB')
6016 ('betrayed', 'VERB')
6017 ('him', 'PRON')
6018 ('.', '')
6019 ('And', 'CONJ')
6020 ('it', 'PRON')
6021 ('was', 'VERB')
6022 ('exactly', 'ADV')
6023 ('at', 'ADP')
6024 ('this', 'DET')
6025 ('moment', 'NOUN')
6026 ('that', 'CONJ')
6027 ('the', 'DET')
6028 ('significant', 'ADJ')
6029 ('thing', 'NOUN')
6030 ('happened', 'VERB')
6031 ('-', '')
6032 ('if', 'CONJ')
6033 (',', '')
6034 ('indeed', 'ADV')
6035 (',', '')
6036 ('it', 'PRON')
6037 ('did', 'VERB')
6038 ('happen', 'VERB')
6039 ('.', '')
6040 ('Momentarily', 'ADV')
6041 ('he', 'PRON')
6042 ('caught'

6639 ('was', 'VERB')
6640 ('useless', 'ADJ')
6641 ('.', '')
6642 ('Whether', 'CONJ')
6643 ('he', 'PRON')
6644 ('wrote', 'VERB')
6645 ('Down', 'ADV')
6646 ('with', 'ADP')
6647 ('Big', 'ADJ')
6648 ('Brother', 'NOUN')
6649 (',', '')
6650 ('or', 'CONJ')
6651 ('whether', 'CONJ')
6652 ('he', 'PRON')
6653 ('refrained', 'VERB')
6654 ('from', 'ADP')
6655 ('writing', 'VERB')
6656 ('it', 'PRON')
6657 (',', '')
6658 ('made', 'VERB')
6659 ('no', 'DET')
6660 ('difference', 'NOUN')
6661 ('.', '')
6662 ('Whether', 'CONJ')
6663 ('he', 'PRON')
6664 ('went', 'VERB')
6665 ('on', 'ADP')
6666 ('with', 'ADP')
6667 ('the', 'DET')
6668 ('diary', 'NOUN')
6669 (',', '')
6670 ('or', 'CONJ')
6671 ('whether', 'CONJ')
6672 ('he', 'PRON')
6673 ('did', 'VERB')
6674 ('not', 'ADV')
6675 ('go', 'VERB')
6676 ('on', 'ADP')
6677 ('with', 'ADP')
6678 ('it', 'PRON')
6679 (',', '')
6680 ('made', 'VERB')
6681 ('no', 'DET')
6682 ('difference', 'NOUN')
6683 ('.', '')
6684 ('The', 'DET')
6685 ('Thought', 'NOUN')
6686 ('Police', 'N

7388 ('for', 'ADP')
7389 ('two', 'NUM')
7390 ('years', 'NOUN')
7391 ('.', '')
7392 ('Of', 'ADP')
7393 ('course', 'NOUN')
7394 ('it', 'PRON')
7395 ("'s", 'VERB')
7396 ('only', 'ADV')
7397 ('because', 'CONJ')
7398 ('Tom', 'NOUN')
7399 ("isn't", 'VERB')
7400 ('home', 'ADV')
7401 (',', '')
7402 ('said', 'VERB')
7403 ('Mrs', 'NOUN')
7404 ('Parsons', 'NOUN')
7405 ('vaguely', 'ADV')
7406 ('.', '')
7407 ('The', 'DET')
7408 ('Parsons', 'NOUN')
7409 ("'", 'ADP')
7410 ('flat', 'NOUN')
7411 ('was', 'VERB')
7412 ('bigger', 'ADJ')
7413 ('than', 'ADP')
7414 ('Winston', 'NOUN')
7415 ("'s", 'ADP')
7416 (',', '')
7417 ('and', 'CONJ')
7418 ('dingy', 'ADJ')
7419 ('in', 'ADP')
7420 ('a', 'DET')
7421 ('different', 'ADJ')
7422 ('way', 'NOUN')
7423 ('.', '')
7424 ('Everything', 'PRON')
7425 ('had', 'VERB')
7426 ('a', 'DET')
7427 ('battered', 'ADJ')
7428 (',', '')
7429 ('trampled-on', 'VERB')
7430 ('look', 'NOUN')
7431 (',', '')
7432 ('as', 'CONJ')
7433 ('though', 'CONJ')
7434 ('the', 'DET')
7435 ('place', 'NO

8270 ('enough', 'ADV')
8271 ('to', 'ADP')
8272 ('do', 'VERB')
8273 ('so', 'ADV')
8274 ('.', '')
8275 ('It', 'PRON')
8276 ('was', 'VERB')
8277 ('a', 'DET')
8278 ('good', 'ADJ')
8279 ('job', 'NOUN')
8280 ('it', 'PRON')
8281 ('was', 'VERB')
8282 ('not', 'ADV')
8283 ('a', 'DET')
8284 ('real', 'ADJ')
8285 ('pistol', 'NOUN')
8286 ('he', 'PRON')
8287 ('was', 'VERB')
8288 ('holding', 'VERB')
8289 (',', '')
8290 ('Winston', 'NOUN')
8291 ('thought', 'VERB')
8292 ('.', '')
8293 ('Mrs', 'NOUN')
8294 ('Parsons', 'NOUN')
8295 ("'", 'ADP')
8296 ('eyes', 'NOUN')
8297 ('flitted', 'VERB')
8298 ('nervously', 'ADV')
8299 ('from', 'ADP')
8300 ('Winston', 'NOUN')
8301 ('to', 'ADP')
8302 ('the', 'DET')
8303 ('children', 'NOUN')
8304 (',', '')
8305 ('and', 'CONJ')
8306 ('back', 'ADV')
8307 ('again', 'ADV')
8308 ('.', '')
8309 ('In', 'ADP')
8310 ('the', 'DET')
8311 ('better', 'ADJ')
8312 ('light', 'NOUN')
8313 ('of', 'ADP')
8314 ('the', 'DET')
8315 ('living-room', 'NOUN')
8316 ('he', 'PRON')
8317 ('noticed', '

8887 ('the', 'DET')
8888 ('diary', 'NOUN')
8889 ('.', '')
8890 ('Suddenly', 'ADV')
8891 ('he', 'PRON')
8892 ('began', 'VERB')
8893 ('thinking', 'VERB')
8894 ('of', 'ADP')
8895 ("O'Brien", 'NOUN')
8896 ('again', 'ADV')
8897 ('.', '')
8898 ('Years', 'NOUN')
8899 ('ago', 'ADP')
8900 ('-', '')
8901 ('how', 'ADV')
8902 ('long', 'ADV')
8903 ('was', 'VERB')
8904 ('it', 'PRON')
8905 ('?', '')
8906 ('Seven', 'NUM')
8907 ('years', 'NOUN')
8908 ('it', 'PRON')
8909 ('must', 'VERB')
8910 ('be', 'VERB')
8911 ('-', '')
8912 ('he', 'PRON')
8913 ('had', 'VERB')
8914 ('dreamed', 'VERB')
8915 ('that', 'CONJ')
8916 ('he', 'PRON')
8917 ('was', 'VERB')
8918 ('walking', 'VERB')
8919 ('through', 'ADP')
8920 ('a', 'DET')
8921 ('pitch-dark', 'ADJ')
8922 ('room', 'NOUN')
8923 ('.', '')
8924 ('And', 'CONJ')
8925 ('someone', 'PRON')
8926 ('sitting', 'VERB')
8927 ('to', 'ADP')
8928 ('one', 'NUM')
8929 ('side', 'NOUN')
8930 ('of', 'ADP')
8931 ('him', 'PRON')
8932 ('had', 'VERB')
8933 ('said', 'VERB')
8934 ('as', 'CO

9623 ('On', 'ADP')
9624 ('coins', 'NOUN')
9625 (',', '')
9626 ('on', 'ADP')
9627 ('stamps', 'NOUN')
9628 (',', '')
9629 ('on', 'ADP')
9630 ('the', 'DET')
9631 ('covers', 'NOUN')
9632 ('of', 'ADP')
9633 ('books', 'NOUN')
9634 (',', '')
9635 ('on', 'ADP')
9636 ('banners', 'NOUN')
9637 (',', '')
9638 ('on', 'ADP')
9639 ('posters', 'NOUN')
9640 (',', '')
9641 ('and', 'CONJ')
9642 ('on', 'ADP')
9643 ('the', 'DET')
9644 ('wrappings', 'NOUN')
9645 ('of', 'ADP')
9646 ('a', 'DET')
9647 ('cigarette', 'NOUN')
9648 ('Packet', 'NOUN')
9649 ('-', '')
9650 ('everywhere', 'ADV')
9651 ('.', '')
9652 ('Always', 'ADV')
9653 ('the', 'DET')
9654 ('eyes', 'NOUN')
9655 ('watching', 'VERB')
9656 ('you', 'PRON')
9657 ('and', 'CONJ')
9658 ('the', 'DET')
9659 ('voice', 'NOUN')
9660 ('enveloping', 'VERB')
9661 ('you', 'PRON')
9662 ('.', '')
9663 ('Asleep', 'ADJ')
9664 ('or', 'CONJ')
9665 ('awake', 'ADJ')
9666 (',', '')
9667 ('working', 'VERB')
9668 ('or', 'CONJ')
9669 ('eating', 'VERB')
9670 (',', '')
9671 ('indo

10448 ('He', 'PRON')
10449 ('did', 'VERB')
10450 ('not', 'ADV')
10451 ('remember', 'VERB')
10452 ('his', 'DET')
10453 ('sister', 'NOUN')
10454 ('at', 'ADP')
10455 ('all', 'ADV')
10456 (',', '')
10457 ('except', 'CONJ')
10458 ('as', 'ADP')
10459 ('a', 'DET')
10460 ('tiny', 'ADJ')
10461 (',', '')
10462 ('feeble', 'ADJ')
10463 ('baby', 'NOUN')
10464 (',', '')
10465 ('always', 'ADV')
10466 ('silent', 'ADJ')
10467 (',', '')
10468 ('with', 'ADP')
10469 ('large', 'ADJ')
10470 (',', '')
10471 ('watchful', 'ADJ')
10472 ('eyes', 'NOUN')
10473 ('.', '')
10474 ('Both', 'PRON')
10475 ('of', 'ADP')
10476 ('them', 'PRON')
10477 ('were', 'VERB')
10478 ('looking', 'VERB')
10479 ('up', 'ADP')
10480 ('at', 'ADP')
10481 ('him', 'PRON')
10482 ('.', '')
10483 ('They', 'PRON')
10484 ('were', 'VERB')
10485 ('down', 'ADV')
10486 ('in', 'ADP')
10487 ('some', 'DET')
10488 ('subterranean', 'ADJ')
10489 ('place', 'NOUN')
10490 ('-', '')
10491 ('the', 'DET')
10492 ('bottom', 'NOUN')
10493 ('of', 'ADP')
10494 ('a', 

11267 ('Winston', 'NOUN')
11268 ('wrenched', 'VERB')
11269 ('his', 'DET')
11270 ('body', 'NOUN')
11271 ('out', 'ADP')
11272 ('of', 'ADP')
11273 ('bed', 'NOUN')
11274 ('-', '')
11275 ('naked', 'ADJ')
11276 (',', '')
11277 ('for', 'CONJ')
11278 ('a', 'DET')
11279 ('member', 'NOUN')
11280 ('of', 'ADP')
11281 ('the', 'DET')
11282 ('Outer', 'ADJ')
11283 ('Party', 'NOUN')
11284 ('received', 'VERB')
11285 ('only', 'ADV')
3,NUM
11286 ('3,000', 'NUM')
11287 ('clothing', 'NOUN')
11288 ('coupons', 'NOUN')
11289 ('annually', 'ADV')
11290 (',', '')
11291 ('and', 'CONJ')
11292 ('a', 'DET')
11293 ('suit', 'NOUN')
11294 ('of', 'ADP')
11295 ('pyjamas', 'NOUN')
11296 ('was', 'VERB')
11297 ('600', 'NUM')
11298 ('-', '')
11299 ('and', 'CONJ')
11300 ('seized', 'VERB')
11301 ('a', 'DET')
11302 ('dingy', 'ADJ')
11303 ('singlet', 'NOUN')
11304 ('and', 'CONJ')
11305 ('a', 'DET')
11306 ('pair', 'NOUN')
11307 ('of', 'ADP')
11308 ('shorts', 'NOUN')
11309 ('that', 'PRON')
11310 ('were', 'VERB')
11311 ('lying', 'VE

12049 ('his', 'DET')
12050 ('eyes', 'NOUN')
12051 ('were', 'VERB')
12052 ('pure', 'ADJ')
12053 ('gin', 'NOUN')
12054 ('.', '')
12055 ('But', 'CONJ')
12056 ('though', 'CONJ')
12057 ('slightly', 'ADV')
12058 ('drunk', 'ADJ')
12059 ('he', 'PRON')
12060 ('was', 'VERB')
12061 ('also', 'ADV')
12062 ('suffering', 'VERB')
12063 ('under', 'ADP')
12064 ('some', 'DET')
12065 ('grief', 'NOUN')
12066 ('that', 'PRON')
12067 ('was', 'VERB')
12068 ('genuine', 'ADJ')
12069 ('and', 'CONJ')
12070 ('unbearable', 'ADJ')
12071 ('.', '')
12072 ('In', 'ADP')
12073 ('his', 'DET')
12074 ('childish', 'ADJ')
12075 ('way', 'NOUN')
12076 ('Winston', 'NOUN')
12077 ('grasped', 'VERB')
12078 ('that', 'CONJ')
12079 ('some', 'DET')
12080 ('terrible', 'ADJ')
12081 ('thing', 'NOUN')
12082 (',', '')
12083 ('something', 'PRON')
12084 ('that', 'PRON')
12085 ('was', 'VERB')
12086 ('beyond', 'ADP')
12087 ('forgiveness', 'NOUN')
12088 ('and', 'CONJ')
12089 ('could', 'VERB')
12090 ('never', 'ADV')
12091 ('be', 'VERB')
12092 ('re

12937 ('Winston', 'NOUN')
12938 ('loathed', 'VERB')
12939 ('this', 'DET')
12940 ('exercise', 'NOUN')
12941 (',', '')
12942 ('which', 'PRON')
12943 ('sent', 'VERB')
12944 ('shooting', 'ADJ')
12945 ('pains', 'NOUN')
12946 ('all', 'DET')
12947 ('the', 'DET')
12948 ('way', 'NOUN')
12949 ('from', 'ADP')
12950 ('his', 'DET')
12951 ('heels', 'NOUN')
12952 ('to', 'ADP')
12953 ('his', 'DET')
12954 ('buttocks', 'NOUN')
12955 ('and', 'CONJ')
12956 ('often', 'ADV')
12957 ('ended', 'VERB')
12958 ('by', 'ADP')
12959 ('bringing', 'VERB')
12960 ('on', 'ADP')
12961 ('another', 'DET')
12962 ('coughing', 'VERB')
12963 ('fit', 'NOUN')
12964 ('.', '')
12965 ('The', 'DET')
12966 ('half-pleasant', 'ADJ')
12967 ('quality', 'NOUN')
12968 ('went', 'VERB')
12969 ('out', 'ADP')
12970 ('of', 'ADP')
12971 ('his', 'DET')
12972 ('meditations', 'NOUN')
12973 ('.', '')
12974 ('The', 'DET')
12975 ('past', 'NOUN')
12976 (',', '')
12977 ('he', 'PRON')
12978 ('reflected', 'VERB')
12979 (',', '')
12980 ('had', 'VERB')
12981

14091 ('As', 'CONJ')
14092 ('it', 'PRON')
14093 ('happened', 'VERB')
14094 (',', '')
14095 ('the', 'DET')
14096 ('Eurasian', 'ADJ')
14097 ('Higher', 'ADJ')
14098 ('Command', 'NOUN')
14099 ('had', 'VERB')
14100 ('launched', 'VERB')
14101 ('its', 'DET')
14102 ('offensive', 'NOUN')
14103 ('in', 'ADP')
14104 ('South', 'ADJ')
14105 ('India', 'NOUN')
14106 ('and', 'CONJ')
14107 ('left', 'VERB')
14108 ('North', 'ADJ')
14109 ('Africa', 'NOUN')
14110 ('alone', 'ADJ')
14111 ('.', '')
14112 ('It', 'PRON')
14113 ('was', 'VERB')
14114 ('therefore', 'ADV')
14115 ('necessary', 'ADJ')
14116 ('to', 'ADP')
14117 ('rewrite', 'VERB')
14118 ('a', 'DET')
14119 ('paragraph', 'NOUN')
14120 ('of', 'ADP')
14121 ('Big', 'ADJ')
14122 ('Brother', 'NOUN')
14123 ("'s", 'ADP')
14124 ('speech', 'NOUN')
14125 (',', '')
14126 ('in', 'ADP')
14127 ('such', 'ADJ')
14128 ('a', 'DET')
14129 ('way', 'NOUN')
14130 ('as', 'CONJ')
14131 ('to', 'ADP')
14132 ('make', 'VERB')
14133 ('him', 'PRON')
14134 ('predict', 'VERB')
14135 ('

14778 ('instructions', 'NOUN')
14779 ('which', 'PRON')
14780 ('Winston', 'NOUN')
14781 ('received', 'VERB')
14782 (',', '')
14783 ('and', 'CONJ')
14784 ('which', 'PRON')
14785 ('he', 'PRON')
14786 ('invariably', 'ADV')
14787 ('got', 'VERB')
14788 ('rid', 'VERB')
14789 ('of', 'ADP')
14790 ('as', 'ADV')
14791 ('soon', 'ADV')
14792 ('as', 'CONJ')
14793 ('he', 'PRON')
14794 ('had', 'VERB')
14795 ('dealt', 'VERB')
14796 ('with', 'ADP')
14797 ('them', 'PRON')
14798 (',', '')
14799 ('never', 'ADV')
14800 ('stated', 'VERB')
14801 ('or', 'CONJ')
14802 ('implied', 'VERB')
14803 ('that', 'CONJ')
14804 ('an', 'DET')
14805 ('act', 'NOUN')
14806 ('of', 'ADP')
14807 ('forgery', 'NOUN')
14808 ('was', 'VERB')
14809 ('to', 'ADP')
14810 ('be', 'VERB')
14811 ('committed', 'VERB')
14812 (':', '')
14813 ('always', 'ADV')
14814 ('the', 'DET')
14815 ('reference', 'NOUN')
14816 ('was', 'VERB')
14817 ('to', 'ADP')
14818 ('slips', 'NOUN')
14819 (',', '')
14820 ('errors', 'NOUN')
14821 (',', '')
14822 ('misprints

15730 ('the', 'DET')
15731 ('whole', 'ADJ')
15732 ('operation', 'NOUN')
15733 ('at', 'ADP')
15734 ('a', 'DET')
15735 ('lower', 'ADJ')
15736 ('level', 'NOUN')
15737 ('for', 'ADP')
15738 ('the', 'DET')
15739 ('benefit', 'NOUN')
15740 ('of', 'ADP')
15741 ('the', 'DET')
15742 ('proletariat', 'NOUN')
15743 ('.', '')
15744 ('There', 'PRON')
15745 ('was', 'VERB')
15746 ('a', 'DET')
15747 ('whole', 'ADJ')
15748 ('chain', 'NOUN')
15749 ('of', 'ADP')
15750 ('separate', 'ADJ')
15751 ('departments', 'NOUN')
15752 ('dealing', 'VERB')
15753 ('with', 'ADP')
15754 ('proletarian', 'ADJ')
15755 ('literature', 'NOUN')
15756 (',', '')
15757 ('music', 'NOUN')
15758 (',', '')
15759 ('drama', 'NOUN')
15760 (',', '')
15761 ('and', 'CONJ')
15762 ('entertainment', 'NOUN')
15763 ('generally', 'ADV')
15764 ('.', '')
15765 ('Here', 'ADV')
15766 ('were', 'VERB')
15767 ('produced', 'VERB')
15768 ('rubbishy', 'ADJ')
15769 ('newspapers', 'NOUN')
15770 ('containing', 'VERB')
15771 ('almost', 'ADV')
15772 ('nothing', 'P

The below code checks whether the two match

In [50]:
for idx, x in enumerate(extra_set_list):
    if not x == bubu2[idx]:
        print(x, bubu2[idx], idx)
       #break
print(extra_set_list[15650:15660])
print(dev_labels[15650:15660])
print(tens[15650][0])
print(tens[15659][0])
print(dev_labels[15659])
print(extra_set_list[15659])

#Taking a section of the tensor, and corresponding labels that we know match for trial
xx = np.copy(tens[0:15659])
yy = np.copy(dev_labels[0:15659])

Winston's Winston 510
people's people 576
Winston's Winston 594
tomorrow's tomorrow 1392
child's child 2883
O'Brien's O'Brien 3663
prize-fighter's prize-fighter 3668
O'Brien's O'Brien 3694
one's one 3894
one's one 3906
Party's Party 4058
Winston's Winston 4128
Goldstein's Goldstein 4403
Goldstein's Goldstein 4468
O'Brien's O'Brien 4867
Goldstein's Goldstein 4934
one's one 5055
Winston's Winston 5098
one's one 5276
one's one 5299
sheep's sheep 5470
everyone's everyone 5691
. ... 5779
. ... 5784
. ... 5933
O'Brien's O'Brien 6027
O'Brien's O'Brien 6181
else's else 6188
it's it 7375
Winston's Winston 7394
It's It 7572
he'd he 7676
He's He 7693
Winston's Winston 7706
I'm I 7947
boy's boy 8119
You're You 8130
You're You 8138
You're You 8142
I'll I 8147
I'll I 8151
I'll I 8155
boy's boy 8216
They're They 8307
that's that 8318
woman's woman 8522
O'Brien's O'Brien 9011
morning's morning 9046
,'t , 9303
,'t , 9327
father's father 10355
one's one 10681
mother's mother 10783
women's women 11021
. 

IndexError: index 15650 is out of bounds for dimension 0 with size 40

Removing NaN values

In [1576]:
#Before removing any nan values (shape)
print(np.shape(xx))
print(np.shape(yy))
#After removing nan values (shape)
print(np.shape(xx[~np.isnan(xx).any(axis=1),:]))
print(np.shape(yy[~np.isnan(xx[:,0])]))

#Removing any NaN values for both x and corresponding labels
yy = yy[~np.isnan(xx[:,0])]
xx = xx[~np.isnan(xx).any(axis=1),:]

#Check shape again and that they match
print(np.shape(xx))
print(np.shape(yy))

(15922, 768)
(15922,)
(15952, 768)
(15952,)


In [1598]:
#Assigning to another variable for splitting
fun_train = xx
fun_label =  yy
np.shape(fun_train),np.shape(fun_label)

40
(15922, 768)
(15992,)


((15922, 768), (15922,))

Splitting dataset to train and test - shuffle: True

In [1581]:
from sklearn.model_selection import train_test_split
TEST_SIZE = 0.1
X_train, X_test, Y_train, Y_test = train_test_split(fun_train, fun_label, test_size=TEST_SIZE, shuffle=True,random_state=2)

Label encoder: (only required for keras sequential layers below)

In [20]:
from sklearn.preprocessing import LabelEncoder
def label_encoder(train_labels, test_labels):
	le = LabelEncoder()
	le.fit(np.concatenate((train_labels,test_labels)))
    
    #transform
	#train_label_enc = le.transform(train_labels)
	#test_label_enc = le.transform(test_labels)

    #transform + one-hot
	train_label_enc = np_utils.to_categorical(le.transform(train_labels))
	test_label_enc = np_utils.to_categorical(le.transform(test_labels))
    
	return train_label_enc, test_label_enc

In [1582]:
Y_traini, Y_testi = label_encoder(Y_train,Y_test)

Scaling to experiment below:

In [1583]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
scaler = preprocessing.StandardScaler().fit(X_train)
X_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

Below using logistic regression on training set composed of embeddings and coresponding POS tags. Encoding not required for logistic regression (inbuilt). Commented out some accuracies below for reference.

In [1601]:
clf = LogisticRegression(max_iter=1000000)
clf.fit(X_scaled[0:-100], Y_train[0:-100])
dev_pred_labels = clf.predict(X_test_scaled)

print('train_acc',clf.score(X_scaled[0:-100], Y_train[0:-100]))
print('test_acc',clf.score(X_test_scaled,Y_test))
print('left_out_set_acc',clf.score(X_scaled[-100:],Y_train[-100:]))

#print('acc', accuracy_score(Y_test ,dev_pred_labels))

#For partial dataset:
# train_acc 0.9802
# test_acc 0.4919735599622285

# for whole dataset:
#train_acc 0.7923755513547575
#test_acc 0.5873465533522191

#with standard scaling:
#train_acc 0.7864944339424491
#test_acc 0.6005665722379604

#and no shuffling
#train_acc 0.7827137156059651
#test_acc 0.6203966005665722

train_acc 0.7525476140276899
test_acc 0.6126804770872567
left_out_set_acc 0.62


Confusion matrix to see what is being misclassified below:

In [1585]:
import pandas
import seaborn
from sklearn.metrics import confusion_matrix
confusion_matrix(Y_test,dev_pred_labels)

array([[122,   2,   8,   4,   4,   7,  10,   0,   9,  12,   0],
       [ 10,  60,   7,   6,   1,   5,  16,   0,   0,   5,   0],
       [  8,  10, 117,   9,   6,  17,  15,   1,  10,  20,   0],
       [  7,   4,   6,  49,   7,   2,  12,   0,   6,  11,   0],
       [  6,   1,   1,   6,  41,   4,   8,   1,   2,   5,   0],
       [  7,   4,  12,   6,   0, 141,  25,   1,   4,  11,   0],
       [ 33,  10,  17,   4,   6,  19, 187,   0,   7,   8,   0],
       [  3,   0,   0,   2,   0,   2,   3,   8,   1,   1,   0],
       [  4,   3,   5,   2,   2,   5,  10,   2,  89,  12,   0],
       [ 15,   6,  12,  11,   9,   8,  18,   0,  14, 163,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   1]])

In [1586]:
print(np.shape(X_train))
print(np.shape(X_test))

print(np.shape(Y_traini))
print(np.shape(Y_testi))

(14329, 768)
(1593, 768)
(14329, 11)
(1593, 11)


Tried implementing a sequential layer with one dense layer (vocab size) and classification layer (with softmax)

In [21]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional,Flatten, Dense, TimeDistributed, Dropout, Activation,SimpleRNN, LSTM

model = Sequential ()
#model.add(Bidirectional(LSTM(64, return_sequences = True)))
model.add(Dense(768, activation='linear'))
#model.add(Dropout(0.2))
#model.add(Bidirectional(LSTM(32, return_sequences = True)))
#model.add(Dense(11, activation='sigmoid'))
#model.add(Dropout(0.4))
#model.add(Bidirectional(LSTM(32, return_sequences = True)))
model.add(Dense(11, activation='softmax'))
#model.add(TimeDistributed(Dense(11, activation='softmax')))

In [22]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [23]:
model.build((14329, 768))
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (14329, 768)              590592    
_________________________________________________________________
dense_1 (Dense)              (14329, 11)               8459      
Total params: 599,051
Trainable params: 599,051
Non-trainable params: 0
_________________________________________________________________


Output results below:

In [1597]:
history = model.fit(X_train, Y_traini, batch_size=32, epochs=400, validation_data=(X_test,Y_testi))

Epoch 1/400
Epoch 2/400
Epoch 3/400
Epoch 4/400
Epoch 5/400
Epoch 6/400
Epoch 7/400
Epoch 8/400
Epoch 9/400
Epoch 10/400
Epoch 11/400
Epoch 12/400
Epoch 13/400
Epoch 14/400
Epoch 15/400
Epoch 16/400
Epoch 17/400
Epoch 18/400
Epoch 19/400
Epoch 20/400
Epoch 21/400
Epoch 22/400
Epoch 23/400
Epoch 24/400
Epoch 25/400
Epoch 26/400
Epoch 27/400
Epoch 28/400
Epoch 29/400
Epoch 30/400
Epoch 31/400
Epoch 32/400
Epoch 33/400
Epoch 34/400
Epoch 35/400
Epoch 36/400
Epoch 37/400
Epoch 38/400
Epoch 39/400
Epoch 40/400
Epoch 41/400
Epoch 42/400
Epoch 43/400
Epoch 44/400
Epoch 45/400
Epoch 46/400
Epoch 47/400
Epoch 48/400
Epoch 49/400
Epoch 50/400
Epoch 51/400
Epoch 52/400
Epoch 53/400
Epoch 54/400
Epoch 55/400
Epoch 56/400
Epoch 57/400


Epoch 58/400
Epoch 59/400
Epoch 60/400
Epoch 61/400
Epoch 62/400
Epoch 63/400
Epoch 64/400
Epoch 65/400
Epoch 66/400
Epoch 67/400
Epoch 68/400
Epoch 69/400
Epoch 70/400
Epoch 71/400
Epoch 72/400
Epoch 73/400
Epoch 74/400
Epoch 75/400
Epoch 76/400
Epoch 77/400
Epoch 78/400
Epoch 79/400
Epoch 80/400
Epoch 81/400
Epoch 82/400
Epoch 83/400
Epoch 84/400
Epoch 85/400
Epoch 86/400
Epoch 87/400
Epoch 88/400
Epoch 89/400
Epoch 90/400
Epoch 91/400
Epoch 92/400
Epoch 93/400
Epoch 94/400
Epoch 95/400
Epoch 96/400
Epoch 97/400
Epoch 98/400
Epoch 99/400
Epoch 100/400
Epoch 101/400
Epoch 102/400
Epoch 103/400
Epoch 104/400
Epoch 105/400
Epoch 106/400
Epoch 107/400
Epoch 108/400
Epoch 109/400
Epoch 110/400
Epoch 111/400
Epoch 112/400
Epoch 113/400
Epoch 114/400


Epoch 115/400
Epoch 116/400
Epoch 117/400
Epoch 118/400
Epoch 119/400
Epoch 120/400
Epoch 121/400
Epoch 122/400
Epoch 123/400
Epoch 124/400
Epoch 125/400
Epoch 126/400
Epoch 127/400
Epoch 128/400
Epoch 129/400
Epoch 130/400
Epoch 131/400
Epoch 132/400
Epoch 133/400
Epoch 134/400
Epoch 135/400
Epoch 136/400
Epoch 137/400
Epoch 138/400
Epoch 139/400
Epoch 140/400
Epoch 141/400
Epoch 142/400
Epoch 143/400
Epoch 144/400
Epoch 145/400
Epoch 146/400
Epoch 147/400
Epoch 148/400
Epoch 149/400
Epoch 150/400
Epoch 151/400
Epoch 152/400
Epoch 153/400
Epoch 154/400
Epoch 155/400
Epoch 156/400
Epoch 157/400
Epoch 158/400
Epoch 159/400
Epoch 160/400
Epoch 161/400
Epoch 162/400
Epoch 163/400
Epoch 164/400
Epoch 165/400
Epoch 166/400
Epoch 167/400
Epoch 168/400
Epoch 169/400
Epoch 170/400
Epoch 171/400
Epoch 172/400
Epoch 173/400
Epoch 174/400
Epoch 175/400
Epoch 176/400
Epoch 177/400
Epoch 178/400
Epoch 179/400
Epoch 180/400
Epoch 181/400
Epoch 182/400
Epoch 183/400
Epoch 184/400
Epoch 185/400
Epoch 

Epoch 228/400
Epoch 229/400
Epoch 230/400
Epoch 231/400
Epoch 232/400
Epoch 233/400
Epoch 234/400
Epoch 235/400
Epoch 236/400
Epoch 237/400
Epoch 238/400
Epoch 239/400
Epoch 240/400
Epoch 241/400
Epoch 242/400
Epoch 243/400
Epoch 244/400
Epoch 245/400
Epoch 246/400
Epoch 247/400
Epoch 248/400
Epoch 249/400
Epoch 250/400
Epoch 251/400
Epoch 252/400
Epoch 253/400
Epoch 254/400
Epoch 255/400
Epoch 256/400
Epoch 257/400
Epoch 258/400
Epoch 259/400
Epoch 260/400
Epoch 261/400
Epoch 262/400
Epoch 263/400
Epoch 264/400
Epoch 265/400
Epoch 266/400
Epoch 267/400
Epoch 268/400
Epoch 269/400
Epoch 270/400
Epoch 271/400
Epoch 272/400
Epoch 273/400
Epoch 274/400
Epoch 275/400
Epoch 276/400
Epoch 277/400
Epoch 278/400
Epoch 279/400
Epoch 280/400
Epoch 281/400
Epoch 282/400
Epoch 283/400
Epoch 284/400
Epoch 285/400
Epoch 286/400
Epoch 287/400
Epoch 288/400
Epoch 289/400
Epoch 290/400
Epoch 291/400
Epoch 292/400
Epoch 293/400
Epoch 294/400
Epoch 295/400
Epoch 296/400
Epoch 297/400
Epoch 298/400
Epoch 

Epoch 340/400
Epoch 341/400
Epoch 342/400
Epoch 343/400
Epoch 344/400
Epoch 345/400
Epoch 346/400
Epoch 347/400
Epoch 348/400
Epoch 349/400
Epoch 350/400
Epoch 351/400
Epoch 352/400
Epoch 353/400
Epoch 354/400
Epoch 355/400
Epoch 356/400
Epoch 357/400
Epoch 358/400
Epoch 359/400
Epoch 360/400
Epoch 361/400
Epoch 362/400
Epoch 363/400
Epoch 364/400
Epoch 365/400
Epoch 366/400
Epoch 367/400
Epoch 368/400
Epoch 369/400
Epoch 370/400
Epoch 371/400
Epoch 372/400
Epoch 373/400
Epoch 374/400
Epoch 375/400
Epoch 376/400
Epoch 377/400
Epoch 378/400
Epoch 379/400
Epoch 380/400
Epoch 381/400
Epoch 382/400
Epoch 383/400
Epoch 384/400
Epoch 385/400
Epoch 386/400
Epoch 387/400
Epoch 388/400
Epoch 389/400
Epoch 390/400
Epoch 391/400
Epoch 392/400
Epoch 393/400
Epoch 394/400
Epoch 395/400
Epoch 396/400
Epoch 397/400
Epoch 398/400
Epoch 399/400
Epoch 400/400


Trial to predict a section taken out from of X_train to see if results are the same

In [1456]:
# Xnew= X_train[-500:]
# ynew = np.argmax(model.predict(Xnew), axis=-1)

# for i in range(len(Xnew)):
# 	print("Predicted=%s, Actual=%s" % ( ynew[i],Y_traini[-500:][i]))
 
# #print(ynew)