In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification 
from models import RobertaForTokenClassification, MeanRobertaForSequenceClassification

In [2]:
# Load the pretrained "roberta-base" checkpoint into both project-local model classes.
# The loading log reports the token-classification head (classifier.weight / classifier.bias)
# as newly initialized, i.e. it is untrained at this point.
checkpoint_name = "roberta-base"
model = RobertaForTokenClassification.from_pretrained(checkpoint_name)
seq_model = MeanRobertaForSequenceClassification.from_pretrained(checkpoint_name)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

In [3]:
# Load the tokenizer for the same "roberta-base" checkpoint that both models were loaded from.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")


In [32]:
# Tokenize two hand-written example sentences of different lengths (padding=True pads the
# shorter one) and run the batch through the token-classification model, requesting the
# hidden states in addition to the regular output.
input_sequence = [
    "This is a test sequence rabble cheese over there",
    "random shit",
]
# NOTE: despite its name, `input_ids` holds everything the tokenizer returns; it is unpacked
# into keyword arguments for the model call.
input_ids = tokenizer(input_sequence, padding=True, return_tensors="pt")
output = model(output_hidden_states=True, **input_ids)

Outputs are: BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.0583,  0.0658, -0.0329,  ..., -0.0519, -0.0692, -0.0017],
         [ 0.1603,  0.1393, -0.0063,  ...,  0.3695, -0.1458,  0.1275],
         [ 0.2237,  0.1394,  0.1266,  ..., -0.1773, -0.0596,  0.1509],
         ...,
         [-0.0317,  0.0501,  0.1322,  ..., -0.0249, -0.2542,  0.0774],
         [-0.0871, -0.1530,  0.0140,  ...,  0.1765, -0.1863,  0.2736],
         [-0.0535,  0.0592, -0.0587,  ..., -0.0928, -0.0680, -0.0253]],

        [[-0.0557,  0.0857, -0.0264,  ..., -0.0760, -0.0757, -0.0243],
         [-0.1111, -0.1342, -0.0781,  ..., -0.1758,  0.0368, -0.0319],
         [ 0.0406,  0.0795, -0.0182,  ..., -0.4414, -0.1000, -0.1966],
         ...,
         [-0.0327,  0.0106,  0.0424,  ...,  0.0111, -0.0658,  0.0017],
         [-0.0327,  0.0106,  0.0424,  ...,  0.0111, -0.0658,  0.0017],
         [-0.0327,  0.0106,  0.0424,  ...,  0.0111, -0.0658,  0.0017]]],
       grad_fn=<NativeLayerNormBackward0

In [5]:
# Feed the same tokenizer output to seq_model (MeanRobertaForSequenceClassification).
# NOTE(review): the recorded output reports a [1, 768] sequence representation and a single row
# of logits although input_sequence holds two sentences, and this cell's execution count (5) is
# lower than the tokenization cell's (32). Presumably the output is stale from an earlier
# single-sentence run; re-run top to bottom and confirm the model does not pool across the
# batch dimension.
output2 = seq_model(**input_ids)

Sequence output shape is: torch.Size([1, 768])


In [6]:
# Display the result object returned by seq_model.
output2

SequenceClassifierOutput(loss=None, logits=tensor([[-0.2190, -0.0640]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [7]:
# Display the module tree of the token-classification model.
model

RobertaForTokenClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (L

In [56]:
# Create a bias-free linear head that maps 768-dimensional hidden states to 2 outputs.
# The weights are randomly initialized, so the RNG is seeded first: re-running this cell
# (or the whole notebook) then reproduces the same layer and the same printed outputs.
torch.manual_seed(0)
linear_layer = torch.nn.Linear(768, 2, bias=False)

In [57]:
# Apply the linear layer to the first entry of `output.hidden_states` for the whole batch.
# NOTE(review): under the Hugging Face convention index 0 is the embedding output, not the
# final encoder layer (that would be hidden_states[-1]). The recorded result is consistent
# with that: the first-token rows of both sequences are identical, as are all padding rows.
# Confirm index 0 is what is intended here.
linear_output = linear_layer(output.hidden_states[0])

In [58]:
# Display the layer's output for the whole padded batch.
linear_output

tensor([[[-0.0232,  0.0685],
         [ 0.1119,  0.1944],
         [ 0.1769,  0.3318],
         [ 0.0780, -0.0310],
         [-0.1006, -0.1217],
         [-0.0700,  0.3111],
         [ 0.1485,  0.3663],
         [-0.2116,  0.3100],
         [ 0.2791,  0.1702],
         [ 0.2133,  0.1323],
         [ 0.2286,  0.1328],
         [ 0.4357,  0.1773]],

        [[-0.0232,  0.0685],
         [-0.0553,  0.2431],
         [ 0.1930,  0.1947],
         [ 0.3234,  0.1710],
         [-0.1982,  0.0596],
         [-0.1982,  0.0596],
         [-0.1982,  0.0596],
         [-0.1982,  0.0596],
         [-0.1982,  0.0596],
         [-0.1982,  0.0596],
         [-0.1982,  0.0596],
         [-0.1982,  0.0596]]], grad_fn=<UnsafeViewBackward0>)

In [59]:
# Apply the layer to the first sequence of the batch on its own (input without the batch
# dimension). The result should equal the first block of the batched output, since the layer
# is applied to each token vector independently.
single_linear_output = linear_layer(output.hidden_states[0][0])

In [60]:
# Display the single-sequence result for comparison with the first block of the batched output.
single_linear_output

tensor([[-0.0232,  0.0685],
        [ 0.1119,  0.1944],
        [ 0.1769,  0.3318],
        [ 0.0780, -0.0310],
        [-0.1006, -0.1217],
        [-0.0700,  0.3111],
        [ 0.1485,  0.3663],
        [-0.2116,  0.3100],
        [ 0.2791,  0.1702],
        [ 0.2133,  0.1323],
        [ 0.2286,  0.1328],
        [ 0.4357,  0.1773]], grad_fn=<MmBackward0>)

In [61]:
# Inspect the weight matrix before reproducing the layer with a plain matmul.
# It is stored as (out_features, in_features), so it has to be transposed before it can
# right-multiply the hidden states.
linear_layer.weight.data.shape

torch.Size([2, 768])

In [62]:
# Shape of the tensor the layer is applied to; the recorded run shows [2, 12, 768]
# (presumably batch size, padded sequence length, hidden size).
output.hidden_states[0].shape

torch.Size([2, 12, 768])

In [63]:
# Reproduce the linear layer by hand: y = x @ W^T. There is no bias term to add, because the
# layer was created with bias=False. `.weight` is used directly rather than the discouraged
# `.data` attribute; the computed values are the same.
hidden = output.hidden_states[0]
manual_output = hidden @ linear_layer.weight.T
# Check the equivalence programmatically instead of comparing printed tensors by eye.
torch.testing.assert_close(manual_output, linear_layer(hidden))
manual_output

tensor([[[-0.0232,  0.0685],
         [ 0.1119,  0.1944],
         [ 0.1769,  0.3318],
         [ 0.0780, -0.0310],
         [-0.1006, -0.1217],
         [-0.0700,  0.3111],
         [ 0.1485,  0.3663],
         [-0.2116,  0.3100],
         [ 0.2791,  0.1702],
         [ 0.2133,  0.1323],
         [ 0.2286,  0.1328],
         [ 0.4357,  0.1773]],

        [[-0.0232,  0.0685],
         [-0.0553,  0.2431],
         [ 0.1930,  0.1947],
         [ 0.3234,  0.1710],
         [-0.1982,  0.0596],
         [-0.1982,  0.0596],
         [-0.1982,  0.0596],
         [-0.1982,  0.0596],
         [-0.1982,  0.0596],
         [-0.1982,  0.0596],
         [-0.1982,  0.0596],
         [-0.1982,  0.0596]]], grad_fn=<UnsafeViewBackward0>)