In [None]:
import torch

from datasets import load_dataset, load_metric
from torch import optim
from transformers import AutoModelForQuestionAnswering, AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer

In [None]:
# Notebook-wide random seed; seeds PyTorch's global RNG.
# NOTE(review): only torch is seeded in this cell -- Python's `random` and NumPy
# are not touched here.
seed = 42
torch.manual_seed(seed)

In [None]:
# Report whether a CUDA device is visible to PyTorch before starting any training.
print("Good to go!" if torch.cuda.is_available() else "Please set GPU")

In [65]:
def prepare_mnli_dataset(dataset):
    """Reduce the MNLI splits to ``validation`` / ``test`` using the matched sets.

    The mapping is modified in place: the ``*_matched`` splits are re-registered
    under the plain ``validation`` and ``test`` names and the ``*_unmatched``
    splits are discarded.

    Args:
        dataset: Mutable mapping of split name to split that contains the keys
            ``validation_matched``, ``validation_unmatched``, ``test_matched``
            and ``test_unmatched``.

    Returns:
        The same ``dataset`` object, after modification.

    Raises:
        KeyError: If any of the four expected splits is missing.
    """
    # Pull both matched splits out first, then re-insert them under the new names.
    matched_validation = dataset.pop("validation_matched")
    matched_test = dataset.pop("test_matched")
    dataset["validation"] = matched_validation
    dataset["test"] = matched_test

    # The unmatched splits are not used.
    for unused_split in ("validation_unmatched", "test_unmatched"):
        dataset.pop(unused_split)

    return dataset

def retrieve_model(fine_tunning_task, model_name="roberta-base"):
    """Load the dataset, tokenizer and model for one fine-tuning task.

    Args:
        fine_tunning_task: Task code. ``"sa"`` loads GLUE SST-2, ``"mnli"`` loads
            GLUE MNLI (passed through ``prepare_mnli_dataset``), ``"qa"`` loads
            SQuAD v2 and ``"ner"`` is a placeholder for which nothing is loaded.
        model_name: Checkpoint identifier handed to ``from_pretrained``.

    Returns:
        A ``(dataset, tokenizer, model)`` tuple. All three are ``None`` for
        ``"ner"``.

    Raises:
        ValueError: If ``fine_tunning_task`` is not one of the four codes above.
    """
    # Placeholder task: nothing to load yet.
    if fine_tunning_task == "ner":
        return None, None, None

    # Dataset first, so an unknown task fails before any checkpoint is loaded.
    if fine_tunning_task == "sa":
        dataset = load_dataset("glue", "sst2")
    elif fine_tunning_task == "mnli":
        dataset = prepare_mnli_dataset(load_dataset("glue", "mnli"))
    elif fine_tunning_task == "qa":
        dataset = load_dataset("squad_v2")
    else:
        raise ValueError("Invalid task")

    # Every implemented task uses the same fast tokenizer for the checkpoint.
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

    if fine_tunning_task == "qa":
        model = AutoModelForQuestionAnswering.from_pretrained(model_name, num_labels=2)
    else:
        # Sequence classification head: 3 classes for MNLI, 2 for SST-2.
        label_count = 3 if fine_tunning_task == "mnli" else 2
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=label_count)

    return dataset, tokenizer, model

In [66]:
# Checkpoint to fine-tune.
model_name = "roberta-base"
# Task code understood by retrieve_model(): "sa", "mnli", "ner" or "qa".
# `preprocessing` also reads this module-level name.
task = "qa"
# Load the dataset, tokenizer and model that belong to the chosen task.
dataset, tokenizer, model = retrieve_model(task, model_name)

Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def _answer_token_span(offsets, sequence_ids, answer_start, answer_end):
    """Map a character span of the context to inclusive token indices.

    Args:
        offsets: Per-token ``(char_start, char_end)`` pairs for one encoded example.
        sequence_ids: Per-token sequence id (``1`` marks context tokens).
        answer_start: Character index where the answer begins in the context.
        answer_end: Character index just past the end of the answer.

    Returns:
        ``(start_token, end_token)``, or ``None`` when the encoded context does
        not fully cover the answer (e.g. it was truncated away).
    """
    context_tokens = [idx for idx, seq_id in enumerate(sequence_ids) if seq_id == 1]
    if not context_tokens:
        return None
    first, last = context_tokens[0], context_tokens[-1]
    if offsets[first][0] > answer_start or offsets[last][1] < answer_end:
        return None

    # Last context token that starts at or before the answer's first character.
    start_token = first
    while start_token < last and offsets[start_token + 1][0] <= answer_start:
        start_token += 1
    # First context token that ends at or after the answer's last character.
    end_token = last
    while end_token > first and offsets[end_token - 1][1] >= answer_end:
        end_token -= 1
    return start_token, end_token


def _preprocess_qa(examples):
    """Tokenize question/context pairs and attach answer-span labels.

    Each example yields exactly one encoded row (the context is truncated, not
    split into overlapping windows), so the row count of the batch is unchanged.
    Examples without an answer, or whose answer does not survive truncation, are
    labelled with the position of the CLS token.
    """
    # Leading whitespace in the question only eats into the room left for the context.
    questions = [question.lstrip() for question in examples["question"]]
    encoded = tokenizer(
        questions,
        examples["context"],
        truncation="only_second",  # never truncate the question, only the context
        return_offsets_mapping=True,
    )
    # Offsets are only needed to compute the labels; they are not a model input.
    offset_mapping = encoded.pop("offset_mapping")

    start_positions, end_positions = [], []
    for row, (offsets, answers) in enumerate(zip(offset_mapping, examples["answers"])):
        cls_index = encoded["input_ids"][row].index(tokenizer.cls_token_id)
        span = None
        if len(answers["answer_start"]) > 0:
            # Only the first reference answer is used as the training target.
            answer_start = answers["answer_start"][0]
            answer_end = answer_start + len(answers["text"][0])
            span = _answer_token_span(offsets, encoded.sequence_ids(row), answer_start, answer_end)
        start_token, end_token = span if span is not None else (cls_index, cls_index)
        start_positions.append(start_token)
        end_positions.append(end_token)

    encoded["start_positions"] = start_positions
    encoded["end_positions"] = end_positions
    return encoded


def preprocessing(examples):
    """Tokenize a batch of examples for the task selected by the global ``task``.

    Relies on the module-level ``task`` and ``tokenizer`` (a fast tokenizer is
    required for ``"qa"`` because offset mappings are used).

    Args:
        examples: Batch mapping column name -> list of values. Expected columns:
            ``sentence`` for ``"sa"``; ``premise`` and ``hypothesis`` for
            ``"mnli"``; ``question``, ``context`` and ``answers`` for ``"qa"``.

    Returns:
        The tokenizer output for the batch. For ``"qa"`` it additionally holds
        ``start_positions`` and ``end_positions``.

    Raises:
        NotImplementedError: For ``"ner"``, which has no preprocessing yet.
        KeyError: For any other unknown task code.
    """
    if task == "qa":
        return _preprocess_qa(examples)
    if task == "ner":
        raise NotImplementedError("Preprocessing for the 'ner' task is not implemented")

    task_to_keys = {
        "sa": ("sentence", None),
        "mnli": ("premise", "hypothesis"),
    }
    sentence1_key, sentence2_key = task_to_keys[task]

    if sentence2_key is None:
        return tokenizer(examples[sentence1_key], truncation=True)
    return tokenizer(examples[sentence1_key], examples[sentence2_key], truncation=True)

# Apply `preprocessing` across the dataset in batches; the result feeds the
# Trainer's train/validation splits.
encoded_dataset = dataset.map(preprocessing, batched=True)

In [None]:
# Hyperparameters and bookkeeping for the run. The output and TensorBoard
# directories are named after the checkpoint and the task.
training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-{task}",
    learning_rate=2e-5,
    num_train_epochs=5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    weight_decay=0.01,
    evaluation_strategy="epoch",  # "no" to avoid evaluation
    save_strategy="epoch",  # "no" to avoid saving
    logging_steps=500,
    report_to="tensorboard",
    logging_dir=f"./tensorboard/{model_name}-finetuned-{task}",
    # Use the notebook-wide seed so the Trainer does not fall back to its own
    # default and silently ignore a change made in the seed cell.
    seed=seed,
)

# Huggingface optimizers: https://huggingface.co/docs/transformers/v4.39.3/en/main_classes/optimizer_schedules#optimization
# Huggingface Schedulers: https://huggingface.co/docs/transformers/v4.39.3/en/main_classes/optimizer_schedules#schedules
# E.g.
# from transformers import AdamW, get_linear_schedule_with_warmup
# optimizer = AdamW(model.parameters(), lr=training_args.learning_rate)
# num_training_steps = int(training_args.num_train_epochs * len(encoded_dataset["train"]) // training_args.per_device_train_batch_size)
# num_warmup_steps = int(0.1 * num_training_steps)  # Warm up over the first ~10% of training; adjust to your training setup
# scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)


# PyTorch optimizers: https://pytorch.org/docs/stable/optim.html#algorithms
# PyTorch schedulers: https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
# Hand-built optimizer that is passed to the Trainer below.
# NOTE(review): `weight_decay` from `training_args` is not forwarded to SGD here,
# so it presumably has no effect on this optimizer -- confirm that is intended.
optimizer = optim.SGD(model.parameters(), lr=training_args.learning_rate)

# The Trainer advances the LR scheduler after every optimizer update, not once
# per epoch. Feeding gamma=0.9 straight in would shrink the learning rate by a
# factor of ~1e-5 within about 100 updates, so convert the intended 0.9-per-epoch
# decay into the equivalent per-update factor.
# (Approximate: single-process estimate of the number of updates per epoch.)
updates_per_epoch = max(
    1,
    len(encoded_dataset["train"])
    // (training_args.train_batch_size * training_args.gradient_accumulation_steps),
)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9 ** (1.0 / updates_per_epoch))


# Wire the model, data splits, tokenizer and the custom optimizer/scheduler pair
# into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    # Data: train on the "train" split, evaluate on "validation".
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    # Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]
    optimizers=(optimizer, scheduler),
    # compute_metrics=compute_metrics
)

In [None]:
# Run fine-tuning with the Trainer configured earlier in the notebook.
trainer.train()
# Uncomment to also write the final model under models/<checkpoint>-finetuned-<task>:
# trainer.save_model("models/{}-finetuned-{}".format(model_name, task))

In [None]:
# Evaluate on the eval_dataset given to the Trainer (the "validation" split)
# and keep the result for later inspection.
eval_result = trainer.evaluate()

## Single Sentence Classification (Sentiment Analysis)

## Sentence Pair Classification Task (Natural Language Inference, aka textual entailment)

## Single Sentence Tagging Task (Named Entity Recognition)

## Question Answering Task