In [1]:
from transformers import BertTokenizer, BertModel
import torch
import torch.nn as nn
from transformers import AutoModel

In [4]:
bert_path = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(bert_path)
model = BertModel.from_pretrained(bert_path)

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(**inputs, head_mask=torch.tensor([1 for i in range(model.config.num_attention_heads)]))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [48]:
from transformers import BertForSequenceClassification

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

SequenceClassifierOutput(loss=tensor(0.7082, grad_fn=<NllLossBackward0>), logits=tensor([[-0.0025, -0.0324]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [91]:
class BertForClassification(nn.Module):
    def __init__(self):
        super(BertForClassification, self).__init__()
        self.backbone = BertModel.from_pretrained("bert-base-uncased")
        self.num_heads = self.backbone.config.num_attention_heads
        for p in self.parameters():
            p.requires_grad = False  # freeze the backbone model
        self.linear1 = nn.Linear(768, 256)
        self.dropout = nn.Dropout(0.2)
        self.linear2 = nn.Linear(256, 2)  # 2 is the number of classes in this example
        self.__hidden_states__()

    def forward(self, input_ids, attention_mask):
        head_mask = [0 for _ in range(self.num_heads)]
        head_mask_list = []
        for i in range(len(head_mask)):
            head_mask_n = head_mask[:]
            head_mask_n[i] = 1
            head_mask_list.append(head_mask_n)
        head_mask_list = torch.tensor(head_mask_list)
        head_hidden_states = torch.rand(self.num_heads, attention_mask.shape[0],attention_mask.shape[1], 2)
        for i in range(len(head_mask_list)):
            backbone_n = self.backbone(input_ids, attention_mask=attention_mask, head_mask=head_mask_list[i])
            l1 = self.linear1(backbone_n.last_hidden_state)
            dropout = self.dropout(l1)
            l2 = self.linear2(dropout)
            head_hidden_states[i] = l2
        return head_hidden_states


    def __hidden_states__(self):
        backbone = self.backbone(torch.tensor([[1, 1]]), torch.tensor([[1, 1]]), output_hidden_states=True)
        self.hidden_states = backbone.hidden_states

In [92]:
model_x = BertForClassification()
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
x = model_x(inputs["input_ids"],inputs["attention_mask"])

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [96]:
x[0][0][-1].argmax().squeeze()

tensor(1)

In [100]:
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1

labels.squeeze(0).squeeze(0)

tensor(1)

In [38]:
print(outputs[0].shape)
probs = outputs[0]
print(torch.argmax(probs,-1))

torch.Size([1, 8, 768])
tensor([[205, 115, 300, 229,  48, 242, 115, 308]])


In [31]:
# layer-wise

In [33]:
last_hidden_states = outputs.last_hidden_state
last_hidden_states

tensor([[[-0.1144,  0.1937,  0.1250,  ..., -0.3827,  0.2107,  0.5407],
         [ 0.5308,  0.3207,  0.3665,  ..., -0.0036,  0.7579,  0.0388],
         [-0.4877,  0.8849,  0.4256,  ..., -0.6976,  0.4458,  0.1231],
         ...,
         [-0.7003, -0.1815,  0.3297,  ..., -0.4838,  0.0680,  0.8901],
         [-1.0355, -0.2567, -0.0317,  ...,  0.3197,  0.3999,  0.1795],
         [ 0.6080,  0.2610, -0.3131,  ...,  0.0311, -0.6283, -0.1994]]],
       grad_fn=<NativeLayerNormBackward0>)

In [52]:
hidden_states = outputs.hidden_states
hidden_states[-1]

tensor([[[-0.1144,  0.1937,  0.1250,  ..., -0.3827,  0.2107,  0.5407],
         [ 0.5308,  0.3207,  0.3665,  ..., -0.0036,  0.7579,  0.0388],
         [-0.4877,  0.8849,  0.4256,  ..., -0.6976,  0.4458,  0.1231],
         ...,
         [-0.7003, -0.1815,  0.3297,  ..., -0.4838,  0.0680,  0.8901],
         [-1.0355, -0.2567, -0.0317,  ...,  0.3197,  0.3999,  0.1795],
         [ 0.6080,  0.2610, -0.3131,  ...,  0.0311, -0.6283, -0.1994]]],
       grad_fn=<NativeLayerNormBackward0>)

In [53]:
layers = []
layers.append(hidden_states[i] for i in range(len(hidden_states)))
print(layers)

[<generator object <genexpr> at 0x0000018209B1AE40>]


In [25]:
# head-wise

In [47]:
head_mask = [0 for _ in range(model.config.num_attention_heads)]
head_mask_list = []
for i in range(len(head_mask)):
    head_mask_n = head_mask[:]
    head_mask_n[i] = 1
    head_mask_list.append(head_mask_n)
head_mask_list = torch.tensor(head_mask_list)
head_mask_list

tensor([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]])

In [90]:
class BertForClassification(nn.Module):
    def __init__(self):
        super(BertForClassification, self).__init__()
        self.backbone = AutoModel.from_pretrained("../../module/bert-base-uncased")
        for p in self.parameters():
            p.requires_grad = False  # freeze the backbone model
        self.linear1 = nn.Linear(768, 256)
        self.dropout = nn.Dropout(0.5)
        self.linear2 = nn.Linear(256, 2)  # 2 is the number of classes in this example

    def forward(self, input_ids, attention_mask):
        backbone = self.backbone(input_ids, attention_mask=attention_mask, output_hidden_states=True)
        # backbone has the following shape: (batch_size, sequence_length, 768)

        # layer-wise
        layers = list(backbone.hidden_states)
        for i in range(len(backbone.hidden_states)):
            l1 = self.linear1(backbone.hidden_states[i])
            dropout = self.dropout(l1)
            l2 = self.linear2(dropout)
            layers[i] = l2
        return layers

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
model = BertForClassification().forward(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])
print(model)

Some weights of the model checkpoint at ../../module/bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[tensor([[[-0.5210,  0.0122],
         [ 0.1717,  0.1579],
         [ 0.2217,  0.1842],
         [ 0.4237, -0.0129],
         [ 0.3135,  0.8151],
         [ 0.1173,  0.1596],
         [ 0.0501,  0.0714],
         [-0.1485,  0.2781]]], grad_fn=<AddBackward0>), tensor([[[ 0.1059, -0.0953],
         [ 0.2949,  0.2448],
         [ 0.1185, -0.0947],
         [ 0.8020, -0.0554],
         [ 0.1865,  0.2868],
         [ 0.0678, -0.0389],
         [ 0.5599,  0.2315],
         [-0.0388,  0.0898]]], grad_fn=<AddBackward0>), tensor([[[-0.2490, -0.1227],
         [ 0.8054, -0.1821],
         [ 0.3046, -0.3631],
         [ 0.7043,  0.2669],
         [ 0.3155,  0.6857],
         [ 0.4692,  0.0138],
         [-0.0097, -0.1606],
         [ 0.0105,  0.0089]]], grad_fn=<AddBackward0>), tensor([[[ 0.1191, -0.1842],
         [ 1.0479,  0.2365],
         [ 0.4248, -0.1409],
         [ 0.0393, -0.1585],
         [ 0.0070,  0.7503],
         [ 0.3951, -0.0212],
         [ 0.2930, -0.2872],
         [-0.2793, 

In [98]:
from tqdm import tqdm
import time
pbar = tqdm([f"Layer-{i}" for i in range(5)])


for i in tqdm(range(100)):
    time.sleep(1)
    tqdm.set_description("Processing %s"%i)

70


 72%|███████▏  | 72/100 [01:12<00:28,  1.01s/it]

71


 73%|███████▎  | 73/100 [01:13<00:27,  1.01s/it]

72


 74%|███████▍  | 74/100 [01:14<00:26,  1.01s/it]

73


 75%|███████▌  | 75/100 [01:15<00:25,  1.01s/it]

74


 76%|███████▌  | 76/100 [01:16<00:24,  1.01s/it]

75


 77%|███████▋  | 77/100 [01:17<00:23,  1.01s/it]

76


 78%|███████▊  | 78/100 [01:18<00:22,  1.01s/it]

77


 79%|███████▉  | 79/100 [01:20<00:21,  1.01s/it]

78


 80%|████████  | 80/100 [01:21<00:20,  1.01s/it]

79


 81%|████████  | 81/100 [01:22<00:19,  1.01s/it]

80


 82%|████████▏ | 82/100 [01:23<00:18,  1.01s/it]

81


 83%|████████▎ | 83/100 [01:24<00:17,  1.01s/it]

82


 84%|████████▍ | 84/100 [01:25<00:16,  1.01s/it]

83


 85%|████████▌ | 85/100 [01:26<00:15,  1.01s/it]

84


 86%|████████▌ | 86/100 [01:27<00:14,  1.01s/it]

85


 87%|████████▋ | 87/100 [01:28<00:13,  1.01s/it]

86


 88%|████████▊ | 88/100 [01:29<00:12,  1.01s/it]

87


 89%|████████▉ | 89/100 [01:30<00:11,  1.01s/it]

88


 90%|█████████ | 90/100 [01:31<00:10,  1.01s/it]

89


 91%|█████████ | 91/100 [01:32<00:09,  1.01s/it]

90


 92%|█████████▏| 92/100 [01:33<00:08,  1.01s/it]

91


 93%|█████████▎| 93/100 [01:34<00:07,  1.01s/it]

92


 94%|█████████▍| 94/100 [01:35<00:06,  1.01s/it]

93


 95%|█████████▌| 95/100 [01:36<00:05,  1.01s/it]

94


 96%|█████████▌| 96/100 [01:37<00:04,  1.01s/it]

95


 97%|█████████▋| 97/100 [01:38<00:03,  1.01s/it]

96


 98%|█████████▊| 98/100 [01:39<00:02,  1.01s/it]

97


 99%|█████████▉| 99/100 [01:40<00:01,  1.01s/it]

98


100%|██████████| 100/100 [01:41<00:00,  1.01s/it]

99



  0%|          | 0/5 [01:41<?, ?it/s]

  0%|          | 0/100 [00:01<?, ?it/s][A


AttributeError: 'str' object has no attribute 'desc'