In [52]:
# %%capture 
# !pip install -U tokenizers
# !pip install -U transformers
# !pip install -U tokenizers
# !pip install -U transformers

In [1]:
import datasets
import sys
from tqdm import tqdm
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "1"

sys.path.append('../../../')
from transformers.models.bert.tokenization_bert import BertTokenizer
from transformers.models.bert.modeling_linbert import BertForSequenceClassification, BertForMaskedLM

import torch
import pickle
from fvcore.nn import FlopCountAnalysis

In [2]:
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding, DataCollatorForLanguageModeling


In [3]:

# from transformers_modified.models.bert.tokenization_bert import BertTokenizer
# BertModelForSequenceClassification
import torch
import copy
import numpy as np

In [4]:
model_id = 'bert-base-cased'

model = BertForMaskedLM.from_pretrained(model_id)
tokenizer = BertTokenizer.from_pretrained(model_id)
seq_class_model = BertForSequenceClassification.from_pretrained(model_id, num_labels=2)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Loading the tokenizer from the `special_tokens_map.json` and the `added_tokens.json` will be removed in `transformers 5`,  it is kept for forward compatibility, but it is recommended to update your `tokenizer_config.json` by uploading it again. You will see the new `added_tokens_decoder` attribute that will store 

In [5]:
seq_class_model.linearize(128, 32)

linearized


In [6]:
batch = torch.tensor([[0, 15, 3, 4], [0, 1, 1, 2]])

In [7]:
seq_class_model(batch)

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


SequenceClassifierOutput(loss=None, logits=tensor([[ 0.6627, -0.1643],
        [ 0.7424, -0.1676]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [12]:
device = torch.device("cuda")
seq_class_model.to(device)
dummy_input = torch.zeros(16, 512, dtype=torch.long).to(device)

# INIT LOGGERS
starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
repetitions = 300
timings=np.zeros((repetitions,1))
#GPU-WARM-UP
for _ in range(10):
    _ = seq_class_model(dummy_input)
# MEASURE PERFORMANCE
with torch.no_grad():
    for rep in tqdm(range(repetitions)):
        starter.record()
        _ = seq_class_model(input_ids=dummy_input)
        ender.record()
        # WAIT FOR GPU SYNC
        torch.cuda.synchronize()
        curr_time = starter.elapsed_time(ender)
        timings[rep] = curr_time

mean_syn = np.sum(timings) / repetitions
std_syn = np.std(timings)
print('mean {} +- {}'.format(mean_syn, std_syn))
# 8 - 4.645040099620819 16 - 4.87 32 - 5.32

100%|██████████| 300/300 [01:48<00:00,  2.78it/s]

mean 356.131806640625 +- 19.686272189282995





In [5]:
lin_model = copy.deepcopy(seq_class_model)
lin_model.linearize(128, 16)

linearized


In [6]:
device = torch.device("cuda")
lin_model.to(device)
dummy_input = torch.zeros(64, 128, dtype=torch.long).to(device)

# INIT LOGGERS
starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
repetitions = 1000
timings=np.zeros((repetitions,1))
#GPU-WARM-UP
for _ in range(10):
    _ = lin_model(dummy_input)
# MEASURE PERFORMANCE
with torch.no_grad():
    for rep in tqdm(range(repetitions)):
        starter.record()
        _ = lin_model(input_ids=dummy_input)
        ender.record()
        # WAIT FOR GPU SYNC
        torch.cuda.synchronize()
        curr_time = starter.elapsed_time(ender)
        timings[rep] = curr_time

mean_syn = np.sum(timings) / repetitions
std_syn = np.std(timings)
print('mean linearized {} +- {}'.format(mean_syn, std_syn))
# 8 - 4.908345495859782  16 - 4.972362136046092 64 - 6.3
# mean linearized 6.310050270318985 +- 1.5480504170206077
# mean 6.814684094429016 +- 2.0908597950889405

# 128 mean linearized 10.194645232677459 +- 2.1097197328117305
# 128 mean 10.301190483093261 +- 2.14933867617596

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
100%|██████████| 1000/1000 [04:41<00:00,  3.55it/s]

mean linearized 276.56400245666504 +- 44.34218480368732





In [22]:
def create_data(tokenizer):
    train_set = datasets.load_dataset('sst2', split='train').remove_columns(['idx'])
    val_set = datasets.load_dataset('sst2', split='validation').remove_columns(['idx'])

    dynamic_padding = True

    def tokenize_func(examples):
        return tokenizer(examples["sentence"], max_length=128, truncation=True)

    encoded_dataset_train = train_set.map(tokenize_func, batched=True)
    encoded_dataset_test = val_set.map(tokenize_func, batched=True)
    data_collator = DataCollatorWithPadding(tokenizer)
   # data_collator = DataCollatorForLanguageModeling(tokenizer, mlm_probability=0.15)
    return encoded_dataset_train, encoded_dataset_test, data_collator

def create_mlm_data(tokenizer):
    train_set = datasets.load_dataset('imdb', split='train').remove_columns(['label'])
    val_set = datasets.load_dataset('imdb', split='test').remove_columns(['label'])

    def tokenize_func(examples):
        return tokenizer(examples["text"], max_length=128, padding='max_length', truncation=True)

    encoded_dataset_train = train_set.map(tokenize_func, batched=True)
    encoded_dataset_val = val_set.map(tokenize_func, batched=True)
    data_collator = DataCollatorForLanguageModeling(tokenizer)

    return encoded_dataset_train, encoded_dataset_val, data_collator

In [23]:
encoded_dataset_train, encoded_dataset_test, data_collator = create_data(tokenizer)

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

In [25]:
np.histogram(list(map(len, encoded_dataset_train['input_ids'])))

(array([29978, 17189,  9249,  5375,  3051,  1497,   675,   247,    76,
           12]),
 array([ 3. ,  9.9, 16.8, 23.7, 30.6, 37.5, 44.4, 51.3, 58.2, 65.1, 72. ]))

In [26]:
np.mean(list(map(len, encoded_dataset_train['input_ids'])))

13.952649631026445

In [5]:
device = 'cuda'
seq_len = 128
k = 32
bs = 1

seq_class_model = seq_class_model.to(device)
dummy_input = torch.ones(bs, seq_len, dtype=torch.long).to(device)

default_flops = FlopCountAnalysis(seq_class_model, dummy_input) 
print(default_flops.total())

lin_model = copy.deepcopy(seq_class_model)
lin_model.linearize(seq_len, k)

lin_model = lin_model.to(device)
lin = FlopCountAnalysis(lin_model, dummy_input)
print(lin.total())

print(default_flops.total() - lin.total())

Unsupported operator aten::add encountered 27 time(s)
Unsupported operator aten::rsub encountered 1 time(s)
Unsupported operator aten::mul encountered 1 time(s)
Unsupported operator aten::embedding encountered 3 time(s)
Unsupported operator aten::add_ encountered 1 time(s)
Unsupported operator aten::masked_fill_ encountered 24 time(s)
Unsupported operator aten::div encountered 12 time(s)
Unsupported operator aten::softmax encountered 12 time(s)
Unsupported operator aten::gelu encountered 12 time(s)
Unsupported operator aten::tanh encountered 1 time(s)


11186505216
linearized


Unsupported operator aten::add encountered 27 time(s)
Unsupported operator aten::rsub encountered 1 time(s)
Unsupported operator aten::mul encountered 1 time(s)
Unsupported operator aten::embedding encountered 3 time(s)
Unsupported operator aten::add_ encountered 1 time(s)
Unsupported operator aten::masked_fill_ encountered 24 time(s)
Unsupported operator aten::div encountered 12 time(s)
Unsupported operator aten::softmax encountered 12 time(s)
Unsupported operator aten::gelu encountered 12 time(s)
Unsupported operator aten::tanh encountered 1 time(s)


11035510272
150994944


In [6]:
11186505216 / 11035510272

1.0136826426942045

In [34]:
flops.total()

Unsupported operator aten::add encountered 27 time(s)
Unsupported operator aten::rsub encountered 1 time(s)
Unsupported operator aten::mul encountered 1 time(s)
Unsupported operator aten::embedding encountered 3 time(s)
Unsupported operator aten::add_ encountered 1 time(s)
Unsupported operator aten::masked_fill_ encountered 24 time(s)
Unsupported operator aten::div encountered 12 time(s)
Unsupported operator aten::softmax encountered 12 time(s)
Unsupported operator aten::gelu encountered 12 time(s)
Unsupported operator aten::tanh encountered 1 time(s)


773890007040

In [8]:
metric = datasets.load_metric('accuracy')

def compute_metrics(eval_pred):
	predictions, labels = eval_pred
	predictions = np.argmax(predictions, axis=1)
	return metric.compute(predictions=predictions, references=labels)

  metric = datasets.load_metric('accuracy')


In [10]:
training_args = TrainingArguments(
    output_dir='./bert-base-cased-sst2',
    learning_rate=3e-5,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    load_best_model_at_end=True,
    num_train_epochs=5,
    weight_decay=0.1,
    fp16=True,
    fp16_full_eval=True,
    evaluation_strategy="epoch",
    seed=42,
    save_strategy = "epoch",
    save_total_limit=5,
    logging_strategy="epoch",
    report_to="all",
)


trainer = Trainer(
    model=seq_class_model,
    args=training_args,
    train_dataset=encoded_dataset_train,
    eval_dataset=encoded_dataset_test,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [11]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2267,0.210278,0.913991
2,0.1091,0.263743,0.917431
3,0.0722,0.267314,0.917431
4,0.0467,0.294708,0.926606
5,0.0298,0.334369,0.920872


TrainOutput(global_step=2635, training_loss=0.09690964642693015, metrics={'train_runtime': 882.1021, 'train_samples_per_second': 381.753, 'train_steps_per_second': 2.987, 'total_flos': 2.21503330843008e+16, 'train_loss': 0.09690964642693015, 'epoch': 5.0})

In [12]:
trainer.evaluate(encoded_dataset_test)

{'eval_loss': 0.21028630435466766,
 'eval_accuracy': 0.9139908256880734,
 'eval_runtime': 0.7942,
 'eval_samples_per_second': 1097.941,
 'eval_steps_per_second': 8.814,
 'epoch': 5.0}

In [13]:
trainer.push_to_hub('bert-base-cased-sst2')

pytorch_model.bin:   0%|          | 0.00/217M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.09k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

'https://huggingface.co/VityaVitalich/bert-base-cased-sst2/tree/main/'

In [14]:
tokenizer.push_to_hub('bert-base-cased-sst2')

CommitInfo(commit_url='https://huggingface.co/VityaVitalich/bert-base-cased-sst2/commit/8bca97af90ae393e48881bea8254e4c1b498a540', commit_message='Upload tokenizer', commit_description='', oid='8bca97af90ae393e48881bea8254e4c1b498a540', pr_url=None, pr_revision=None, pr_num=None)

In [30]:
from huggingface_hub import notebook_login

In [31]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [32]:
trainer.push_to_hub('bert-tiny-sst2')

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model.bin:   0%|          | 0.00/8.79M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.03k [00:00<?, ?B/s]

'https://huggingface.co/VityaVitalich/results/tree/main/'

In [44]:
model_downloaded = BertForSequenceClassification.from_pretrained('VityaVitalich/bert-tiny-sst2', num_labels=2)
tokenizer = BertTokenizer.from_pretrained('VityaVitalich/bert-tiny-sst2')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [45]:
trainer = Trainer(
    model=model_downloaded,
    args=training_args,
    train_dataset=encoded_dataset_train,
    eval_dataset=encoded_dataset_test,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [46]:
trainer.evaluate(encoded_dataset_test)

{'eval_loss': 0.4771495461463928,
 'eval_accuracy': 0.8279816513761468,
 'eval_runtime': 0.2513,
 'eval_samples_per_second': 3469.28,
 'eval_steps_per_second': 27.85}

In [18]:
seq_class_model.bert = trainer.model.bert

In [24]:
seq_class_model.bert.encoder.layer[0].attention.self.E

Parameter containing:
tensor([[ 0.0107,  0.0228,  0.1497,  ...,  0.0589,  0.1436,  0.0942],
        [ 0.0464,  0.0290, -0.0227,  ...,  0.0387,  0.1120,  0.0134],
        [-0.0765, -0.1274,  0.1461,  ...,  0.0350,  0.0740,  0.0756],
        ...,
        [ 0.1165, -0.1550, -0.0914,  ..., -0.0647, -0.0303,  0.0779],
        [-0.0241,  0.1494,  0.1148,  ..., -0.0482,  0.1539, -0.0846],
        [-0.0982, -0.0761,  0.0075,  ...,  0.1350, -0.0933,  0.1502]],
       device='cuda:0', requires_grad=True)

In [41]:
tokenizer.push_to_hub('bert-tiny-sst2')

CommitInfo(commit_url='https://huggingface.co/VityaVitalich/bert-tiny-sst2/commit/1536c4b2a95662057c5c34e1b0d3ca835446efa2', commit_message='Upload tokenizer', commit_description='', oid='1536c4b2a95662057c5c34e1b0d3ca835446efa2', pr_url=None, pr_revision=None, pr_num=None)