In [3]:
import datasets
import sys

sys.path.append('../../../')
from transformers_modified.src.transformers.models.bert.tokenization_bert import BertTokenizer
from transformers_modified.src.transformers.models.bert.modeling_bert import BertForSequenceClassification

# from transformers_modified.models.bert.tokenization_bert import BertTokenizer
# BertModelForSequenceClassification
import torch
import numpy as np

In [4]:
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding


In [5]:
model = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=2)
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Loading the tokenizer from the `special_tokens_map.json` and the `added_tokens.json` will be removed in `transformers 5`,  it is kept for forward compatibility, but it is recommended to update your `tokenizer_config.json` by uploading it again. You will see the new `added_tokens_decoder` attribute that will store the relevant information.


In [9]:
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=3e-4,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.1,
   # load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    fp16=True,
    fp16_full_eval=True,
    evaluation_strategy="steps",
    eval_steps=200,
    seed=42,
    save_strategy = "steps",
    save_total_limit=5,
    logging_strategy="steps",
    report_to="all",
    logging_steps=200
)


In [10]:
train_set = datasets.load_dataset('sst2', split='train').remove_columns(['idx'])
val_set = datasets.load_dataset('sst2', split='validation').remove_columns(['idx'])

In [11]:
dynamic_padding = True

def tokenize_func(examples):
	return tokenizer(examples["sentence"], truncation=True)  # max_length=512,  padding=True

encoded_dataset_train = train_set.map(tokenize_func, batched=True)
encoded_dataset_test = train_set.map(tokenize_func, batched=True)
data_collator = DataCollatorWithPadding(tokenizer)

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

In [12]:
n_tokens = [len(encoding) for encoding in encoded_dataset_train["input_ids"]]

In [13]:
np.mean(n_tokens), np.sum(n_tokens)

(13.952649631026445, 939697)

In [14]:
encoded_dataset_train[0]

{'sentence': 'hide new secretions from the parental units ',
 'label': 0,
 'input_ids': [101, 4750, 1207, 3318, 5266, 1121, 1103, 22467, 2338, 102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [15]:
metric = datasets.load_metric('accuracy')

  metric = datasets.load_metric('accuracy')


In [16]:
def compute_metrics(eval_pred):
	predictions, labels = eval_pred
	predictions = np.argmax(predictions, axis=1)
	return metric.compute(predictions=predictions, references=labels)

In [17]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset_train,
    eval_dataset=encoded_dataset_test,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [50]:
trainer.train()

Step,Training Loss,Validation Loss,Accuracy
20,0.2745,0.282834,0.887289
40,0.2917,0.251805,0.902538


    There is an imbalance between your GPUs. You may want to exclude GPU 2 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


KeyboardInterrupt: 

In [21]:
out = model(**tokenized, labels=torch.tensor(data[0]['label']))

In [22]:
out

SequenceClassifierOutput(loss=tensor(0.5901, grad_fn=<NllLossBackward0>), logits=tensor([[-0.2105, -0.4284]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [16]:
?model.forward

[0;31mSignature:[0m
[0mmodel[0m[0;34m.[0m[0mforward[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0minput_ids[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mtorch[0m[0;34m.[0m[0mTensor[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mattention_mask[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mtorch[0m[0;34m.[0m[0mTensor[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtoken_type_ids[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mtorch[0m[0;34m.[0m[0mTensor[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mposition_ids[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mtorch[0m[0;34m.[0m[0mTensor[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mhead_mask[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mtorch[0m[0;34m.[0m[0mTensor[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    