## Imports & Constants

In [50]:
import torch
import evaluate
import numpy as np

from transformers import (
    BertTokenizerFast, 
    BertForSequenceClassification, 
    Trainer, 
    TrainingArguments
)

from dataset_types import ReviewDataSet

POSITIVE_REVIEWS_DIR = "./data/pos/"
NEGATIVE_REVIEWS_DIR = "./data/neg/"

## Data

In [51]:
dataset = ReviewDataSet([POSITIVE_REVIEWS_DIR, NEGATIVE_REVIEWS_DIR]).load()

In [52]:
train, dev, test = dataset.as_train_dev_test_dfs("polarity", 0.3)
train.head()

Unnamed: 0,X,y
3937,"This, the direct-to-video death rattle of the ...",0
1961,In the Hollywood west those trail hands were a...,1
2490,This Italian film from the '70's is NOT even i...,0
331,When seeing this movie you should take notice ...,1
1320,I'm not a follower of a certain movie genre. I...,1


Save the train, dev, test sets as CSV files [here](./data/bert).

In [4]:
dataset.to_csv_as_train_dev_test_sets("./data/bert/", "polarity", 0.3)

## Preprocess

In [53]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

Passing `padding=True` and `truncation=True` ensures that all of our sequences are padded to the same length and are truncated to be no longer than the model's maximum input length.

In [54]:
train_encodings = tokenizer(train.X.to_list(), truncation=True, padding=True)
dev_encodings = tokenizer(dev.X.to_list(), truncation=True, padding=True)
test_encodings = tokenizer(test.X.to_list(), truncation=True, padding=True)

## Torch Dataset

In [55]:
class TorchReviewDataSet(torch.utils.data.Dataset):

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])

        return item
    
    def __len__(self):
        return len(self.labels)

In [56]:
train_dataset = TorchReviewDataSet(train_encodings, train.y.to_list())
dev_dataset = TorchReviewDataSet(dev_encodings, dev.y.to_list())
test_dataset = TorchReviewDataSet(test_encodings, test.y.to_list())

## Evaluation metrics

In [57]:
accuracy = evaluate.load("accuracy")

In [58]:
def compute_metrics(eval_pred: tuple):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

## Fine-tuning with Trainer

In [59]:
# training_args = TrainingArguments(
#     output_dir = "./fine_tuning/",
#     num_train_epochs = 3,
#     per_device_train_batch_size = 16,
#     per_device_eval_batch_size = 64,
#     warmup_steps = 500,               # Number of warmup steps for the learning-rate scheduler
#     weight_decay = 0.01,              # Strength of weight decay
#     logging_dir = "./logs/",
#     logging_steps = 10,
# )

# training_args = TrainingArguments(
#     output_dir="./fine_tuning/",
#     learning_rate=2e-5,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=16,
#     num_train_epochs=3,
#     weight_decay=0.01,
#     evaluation_strategy="epoch",
#     save_strategy="epoch",
#     load_best_model_at_end=True,
# )

training_args = TrainingArguments(
    output_dir="./fine_tuning_uncased/",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    load_best_model_at_end=True,
)

In [60]:
bert = BertForSequenceClassification.from_pretrained("bert-base-uncased")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [61]:
trainer = Trainer(
    model=bert,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    compute_metrics=compute_metrics
)

In [62]:
trainer.train()

Step,Training Loss,Validation Loss,Accuracy
50,No log,0.466547,0.79
100,No log,0.281985,0.903333
150,No log,0.292123,0.898333
200,No log,0.301688,0.91
250,No log,0.282471,0.903333
300,No log,0.316315,0.906667
350,No log,0.315991,0.91
400,No log,0.363887,0.903333
450,No log,0.330013,0.923333
500,0.243400,0.329493,0.921667


TrainOutput(global_step=525, training_loss=0.23787961959838866, metrics={'train_runtime': 131.6724, 'train_samples_per_second': 63.795, 'train_steps_per_second': 3.987, 'total_flos': 2210132865024000.0, 'train_loss': 0.23787961959838866, 'epoch': 3.0})

In [63]:
trainer.predict(test_dataset)

PredictionOutput(predictions=array([[-2.3571064,  3.157359 ],
       [-2.3348513,  3.2161546],
       [ 1.285651 , -1.2123792],
       ...,
       [-2.3329377,  3.06709  ],
       [-1.1530583,  1.7550759],
       [-2.494586 ,  3.1649556]], dtype=float32), label_ids=array([1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
    