In [3]:
import torch

# Show which PyTorch build this kernel is using, so the outputs recorded in the
# notebook can be tied to a concrete version.
installed_torch_version = torch.__version__
print(installed_torch_version)

1.2.0


In [6]:
from kb.include_all import ModelArchiveFromParams
from kb.knowbert_utils import KnowBertBatchifier
from allennlp.common import Params

import torch

# Pretrained KnowBert archives that can be loaded in this notebook.
WORDNET_ARCHIVE = "https://allennlp.s3-us-west-2.amazonaws.com/knowbert/models/knowbert_wordnet_model.tar.gz"
WIKI_ARCHIVE = "https://allennlp.s3-us-west-2.amazonaws.com/knowbert/models/knowbert_wiki_model.tar.gz"
WORDNET_WIKI_ARCHIVE = "https://allennlp.s3-us-west-2.amazonaws.com/knowbert/models/knowbert_wiki_wordnet_model.tar.gz"

# Archive actually used below (WordNet only). The model and the batcher are both
# built from this one name so they cannot end up pointing at different archives.
# NOTE(review): the inspection loop further down reads
# batch['candidates']['wordnet'], so presumably only an archive that includes
# WordNet will work there without changes - confirm before switching.
ARCHIVE_FILE = WORDNET_ARCHIVE

# load model and batcher
params = Params({"archive_file": ARCHIVE_FILE})
model = ModelArchiveFromParams.from_params(params=params)
batcher = KnowBertBatchifier(ARCHIVE_FILE)

100%|██████████| 1400916256/1400916256 [01:28<00:00, 15918868.01B/s]
100%|██████████| 563648/563648 [00:00<00:00, 662252.90B/s]


In [63]:
sentences = ["Paris is located in France.", "Michael Jackson is a great singer"]
# The batcher takes raw, untokenized sentences and yields batches of the
# tensors needed to run KnowBert.
for i, batch in enumerate(batcher.iter_batches(sentences, verbose=True)):
    print(f'Loop {i}')
    # Each batch is a dict with the keys 'tokens', 'segment_ids' and 'candidates'.
    print(f"Batch: {batch.keys()}")

    # batch['tokens'] is itself a dict with a single 'tokens' entry: a tensor of
    # token indices (used to index an embedding). A batch holds sentences with
    # different numbers of tokens, so the tensor is padded with zeros.
    # shape: (batch_size (#sentences), max_seq_len)
    print(f"Tokens shape {batch['tokens']['tokens'].shape}")

    # Segment ids (0 for the first segment, 1 for the second); can be used for
    # next-sentence prediction.
    # shape: (batch_size, max_seq_len)
    print(f"Segment ids shape: {batch['segment_ids'].shape}")

    # batch['candidates'] stores, per knowledge base, the entities detected with
    # that knowledge base. With the WordNet archive it only has a 'wordnet' entry.
    wordnet_kb = batch['candidates']['wordnet']
    print(f"Wordnet kb: {wordnet_kb.keys()}")

    # For each detected entity, a list of candidate KB entities that may match it.
    # Priors: correctness probabilities estimated by the entity linker; they sum
    # to 1 on axis 2 (or to 0 for padding rows).
    # Axis 1 is zero-padded when a sentence has fewer detected entities than the
    # sentence with the most; axis 2 is zero-padded when an entity has fewer KB
    # candidates than the entity with the most.
    # shape: (batch_size, max # detected entities, max # KB candidate entities)
    print(f"Candidate entity_priors shape: {wordnet_kb['candidate_entity_priors'].shape}")

    # Ids of the candidate KB entities, zero-padded on axis 1 or 2 when needed.
    # shape: (batch_size, max # detected entities, max # KB candidate entities)
    print(f"Candidate entities ids: {wordnet_kb['candidate_entities']['ids'].shape}")

    # Token spans of each detected entity, e.g. [1, 2] for "Michael Jackson"
    # (both bounds are inclusive). Padded with [-1, -1] once a sentence has no
    # more detected entities. The full tensor is printed on purpose so the spans
    # can be compared with the tokenization shown by verbose=True.
    print(wordnet_kb['candidate_spans'])

    # shape: (batch_size, max # detected entities, 2)
    print(f"Candidate span shape: {wordnet_kb['candidate_spans'].shape}")

    # For each detected entity, the segment id it belongs to.
    # shape: (batch_size, max # detected entities)
    # Print the shape, as the label says; previously the whole tensor was printed.
    print(f"Candidate segments_ids shape: {wordnet_kb['candidate_segment_ids'].shape}")

    # **batch unpacks the dict so that each key ('tokens', 'segment_ids',
    # 'candidates') is passed to the model as a keyword argument.
    # NOTE(review): the saved output of this cell ends with
    # "TypeError: 'tokens' is an invalid keyword argument for this function".
    # That wording is typical of a C-implemented/builtin callable rejecting a
    # keyword, so it may be stale output from an earlier edit of this cell -
    # re-run on a fresh kernel (Restart & Run All) to confirm this call succeeds.
    model_output = model(**batch)

    # model_output['contextual_embeddings'] is a (batch_size, seq_len, embed_dim)
    # tensor of top layer activations.
    # Other entries inspected while exploring: model_output.keys(),
    # model_output['wordnet'], model_output['loss'],
    # model_output['pooled_output'].shape,
    # model_output['contextual_embeddings'].shape

Paris is located in France.
['[CLS]', 'paris', 'is', 'located', 'in', 'france', '.', '[SEP]']
Michael Jackson is a great singer
['[CLS]', 'michael', 'jackson', 'is', 'a', 'great', 'singer', '[SEP]']
Loop 0
Batch: dict_keys(['tokens', 'segment_ids', 'candidates'])
Tokens shape torch.Size([2, 8])
Segment ids shape: torch.Size([2, 8])
Wordnet kb: dict_keys(['candidate_entity_priors', 'candidate_entities', 'candidate_spans', 'candidate_segment_ids'])
Candidate entity_priors shape: torch.Size([2, 7, 14])
Candidate entities ids: torch.Size([2, 7, 14])
tensor([[[ 1,  1],
         [ 2,  2],
         [ 3,  3],
         [ 4,  4],
         [ 5,  5],
         [-1, -1],
         [-1, -1]],

        [[ 1,  1],
         [ 1,  2],
         [ 2,  2],
         [ 3,  3],
         [ 4,  4],
         [ 5,  5],
         [ 6,  6]]])
Candidate span shape: torch.Size([2, 7, 2])
Candidate segments_ids shape: tensor([[0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0]])


TypeError: 'tokens' is an invalid keyword argument for this function