In [4]:
import datasets
import sys

import os

os.environ["CUDA_VISIBLE_DEVICES"] = "3"

sys.path.append('../../../')
from transformers_modified.src.transformers.models.bert.tokenization_bert import BertTokenizer
from transformers_modified.src.transformers.models.bert.modeling_bert import BertForSequenceClassification
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding

# from transformers_modified.models.bert.tokenization_bert import BertTokenizer
# BertModelForSequenceClassification
import torch
import numpy as np

In [None]:
# !pip install -U tokenizers
# !pip install -U transformers

In [6]:
model = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=2)
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Loading the tokenizer from the `special_tokens_map.json` and the `added_tokens.json` will be removed in `transformers 5`,  it is kept for forward compatibility, but it is recommended to update your `tokenizer_config.json` by uploading it again. You will see the new `added_tokens_decoder` attribute that will store the relevant information.


In [13]:
def create_data(tokenizer):
    train_set = datasets.load_dataset('sst2', split='train').remove_columns(['idx'])
    val_set = datasets.load_dataset('sst2', split='validation').remove_columns(['idx'])

    dynamic_padding = True

    def tokenize_func(examples):
        return tokenizer(examples["sentence"], truncation=True)  # max_length=512,  padding=True

    encoded_dataset_train = train_set.map(tokenize_func, batched=True)
    encoded_dataset_test = train_set.map(tokenize_func, batched=True)
    data_collator = DataCollatorWithPadding(tokenizer)

    return encoded_dataset_train, encoded_dataset_test, data_collator

  metric = datasets.load_metric('accuracy')


In [None]:
encoded_dataset_train, encoded_dataset_test, data_collator = create_data(tokenizer)

In [14]:
metric = datasets.load_metric('accuracy')

def compute_metrics(eval_pred):
	predictions, labels = eval_pred
	predictions = np.argmax(predictions, axis=1)
	return metric.compute(predictions=predictions, references=labels)

In [15]:
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=3e-3,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    num_train_epochs=3,
    weight_decay=0.1,
   # load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    fp16=True,
    fp16_full_eval=True,
    evaluation_strategy="steps",
    eval_steps=500,
    seed=42,
    save_strategy = "steps",
    save_total_limit=5,
    logging_strategy="steps",
    report_to="all",
    logging_steps=200
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset_train,
    eval_dataset=encoded_dataset_test,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [21]:
for param in model.parameters():
    param.requires_grad = False

for param in model.classifier.parameters():
    param.requires_grad = True

In [22]:
trainer.train()

Step,Training Loss,Validation Loss,Accuracy
200,0.6606,0.623661,0.677976


KeyboardInterrupt: 

In [21]:
out = model(**tokenized, labels=torch.tensor(data[0]['label']))

In [22]:
out

SequenceClassifierOutput(loss=tensor(0.5901, grad_fn=<NllLossBackward0>), logits=tensor([[-0.2105, -0.4284]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [16]:
?model.forward

[0;31mSignature:[0m
[0mmodel[0m[0;34m.[0m[0mforward[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0minput_ids[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mtorch[0m[0;34m.[0m[0mTensor[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mattention_mask[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mtorch[0m[0;34m.[0m[0mTensor[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtoken_type_ids[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mtorch[0m[0;34m.[0m[0mTensor[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mposition_ids[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mtorch[0m[0;34m.[0m[0mTensor[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mhead_mask[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mtorch[0m[0;34m.[0m[0mTensor[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    