# BERT finetuning on SST-2

## Librairy

In [1]:
import os
import time
import pickle

import numpy as np
import torch

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

from transformers import BertTokenizer, BertTokenizerFast
from transformers import BertForSequenceClassification, AdamW
from transformers import Trainer, TrainingArguments
from transformers import EarlyStoppingCallback
from transformers.data.data_collator import DataCollatorWithPadding

from datasets import load_dataset, Dataset, concatenate_datasets

In [2]:
# print(torch.__version__)
# print(torch.cuda.device_count())
# print(torch.cuda.is_available())
# print(torch.cuda.get_device_name(0))

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# if torch.cuda.is_available():
#     torch.set_default_tensor_type('torch.cuda.FloatTensor')

In [3]:
device

device(type='cuda')

## Global variables

In [4]:
BATCH_SIZE = 24
NB_EPOCHS = 4

In [5]:
RESULTS_FILE = '~/Results/BERT_finetune/sst-2_BERT_finetune_b'+str(BATCH_SIZE)+'results.pkl'
RESULTS_PATH = '~/Results/BERT_finetune/sst-2_b'+str(BATCH_SIZE)+'/'
CACHE_DIR = '~/Data/huggignface/'         # path of your  folder

## Dataset

In [6]:
# download dataset

# train set
raw_datasets = load_dataset('glue', 'sst2', cache_dir=CACHE_DIR)
raw_datasets = raw_datasets.rename_column('sentence', 'text')

# special test set
raw_test_set = load_dataset('gpt3mix/sst2', split='test', cache_dir=CACHE_DIR)

def clean(example):
    example['text'] = example['text'].replace('-LRB-', '(').replace('-RRB-', ')').replace(r'\/', r'/')
    example['label'] = np.abs(example['label'] - 1) # revert labels of test set
    return example

raw_test_set = raw_test_set.map(clean)

raw_datasets['test'] = raw_test_set

Reusing dataset glue (/raid/home/jeremiec/huggingface_datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
Using custom data configuration default
Reusing dataset ss_t2 (/raid/home/jeremiec/huggingface_datasets/ss_t2/default/0.0.0/90167692658fa4abca2ffa3ede1a43a71e2bf671078c5c275c64c4231d5a62fa)
Loading cached processed dataset at /raid/home/jeremiec/huggingface_datasets/ss_t2/default/0.0.0/90167692658fa4abca2ffa3ede1a43a71e2bf671078c5c275c64c4231d5a62fa/cache-d185696e69a8fb51.arrow


In [7]:
# tokenize

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding=True, truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.rename_column('label', 'labels')
tokenized_datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

train_dataset = tokenized_datasets["train"].shuffle(seed=42)
test_dataset = tokenized_datasets["test"].shuffle(seed=42)
val_dataset = tokenized_datasets["validation"].shuffle(seed=42)

Loading cached processed dataset at /raid/home/jeremiec/huggingface_datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-3cf17b2d6f6fffb9.arrow
Loading cached processed dataset at /raid/home/jeremiec/huggingface_datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-943a96702b1c8364.arrow
Loading cached processed dataset at /raid/home/jeremiec/huggingface_datasets/ss_t2/default/0.0.0/90167692658fa4abca2ffa3ede1a43a71e2bf671078c5c275c64c4231d5a62fa/cache-e58927824044e235.arrow
Loading cached shuffled indices for dataset at /raid/home/jeremiec/huggingface_datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-42563925575e84c2.arrow
Loading cached shuffled indices for dataset at /raid/home/jeremiec/huggingface_datasets/ss_t2/default/0.0.0/90167692658fa4abca2ffa3ede1a43a71e2bf671078c5c275c64c4231d5a62fa/cache-09634d626657f86d.arrow
Loading cached shuffled indices for

In [8]:
# get number of labels

num_labels = len(set(train_dataset['labels'].tolist()))
num_labels

2

## Model

#### Model

In [9]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

#### Training

In [10]:
training_args = TrainingArguments(
    
    # output
    output_dir=RESULTS_PATH,          
    
    # params
    num_train_epochs=NB_EPOCHS,               # nb of epochs
    per_device_train_batch_size=BATCH_SIZE,   # batch size per device during training
    per_device_eval_batch_size=BATCH_SIZE,    # cf. paper Sun et al.
    learning_rate=2e-5,                       # cf. paper Sun et al.
#     warmup_steps=500,                         # number of warmup steps for learning rate scheduler
    warmup_ratio=0.1,                         # cf. paper Sun et al.
    weight_decay=0.01,                        # strength of weight decay
    
#     # eval
    evaluation_strategy="steps",
    eval_steps=50,
#     evaluation_strategy='no', # no more evaluation, takes time
    
    # log
    logging_dir=RESULTS_PATH+'logs',  
    logging_strategy='steps',
    logging_steps=50,
    
    # save
#     save_strategy='epoch',
#     load_best_model_at_end=False
    load_best_model_at_end=True               # cf. paper Sun et al.
)

In [11]:
def compute_metrics(p):
    
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    
    return {"val_accuracy": accuracy}

In [12]:
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # compute_metrics=compute_metrics,
    # callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
)

In [None]:
results = trainer.train()

Step,Training Loss,Validation Loss,Runtime,Samples Per Second
50,0.6994,0.694691,2.6775,325.68
100,0.6934,0.691708,2.6812,325.229
150,0.6795,0.686285,2.5577,340.938
200,0.6801,0.676867,2.5614,340.442
250,0.6604,0.632485,2.5844,337.413
300,0.4903,0.512717,2.5614,340.443
350,0.3743,0.431079,2.5921,336.411
400,0.347,0.391426,2.5863,337.159
450,0.3051,0.323702,2.4906,350.118
500,0.35,0.30166,2.4551,355.178


In [None]:
training_time = results.metrics["train_runtime"]
training_time_per_epoch = training_time / training_args.num_train_epochs
training_time_per_epoch

In [None]:
training_time_per_epoch = 367.8208

In [None]:
# here re-launch from here!!!

In [None]:
trainer.save_model(os.path.join(RESULTS_PATH, 'best_model-0'))

## Results

In [None]:
results_d = {}
epoch = 1

ordered_files = sorted( [f for f in os.listdir(RESULTS_PATH) 
                         if (not f.endswith("logs")) and (f.startswith("best")) # best model eval only
                        ], 
                         key=lambda x: int(x.split('-')[1]) )

for filename in ordered_files:
    
    print(filename)
    
    # load model
    model_file = os.path.join(RESULTS_PATH, filename)
    finetuned_model = BertForSequenceClassification.from_pretrained(model_file, num_labels=num_labels)
    finetuned_model.to(device)
    finetuned_model.eval()
    
    # compute test acc
    test_trainer = Trainer(finetuned_model, data_collator=DataCollatorWithPadding(tokenizer))
    
    raw_preds, labels, _ = test_trainer.predict(test_dataset)
    preds = np.argmax(raw_preds, axis=1)
    print("preds", len(preds), "labels", len(labels))

    test_acc = accuracy_score(y_true=labels, y_pred=preds)
    
#     results_d[filename] = (test_acc, training_time_per_epoch*epoch)
    results_d[filename] = test_acc # best model evaluation only
    
    print((test_acc, training_time_per_epoch*epoch))
    
    epoch += 1
    
results_d['training_time'] = training_time

In [None]:
results_d

In [None]:
# save results

with open(RESULTS_FILE, 'wb') as fh:
    pickle.dump(results_d, fh)

In [24]:
# load results

with open(RESULTS_FILE, 'rb') as fh:
    results_d = pickle.load(fh)

In [25]:
results_d

{'best_model-0': 0.9220208676551346, 'training_time': 4065.5297}

In [26]:
4065.5297 / 3600

1.1293138055555556

In [None]:
# batch size = 512 works! (1024 too large)
# so we took the max