In [1]:
# This cell makes sure modules are auto-loaded when you change external python files
%load_ext autoreload
%autoreload 2

In [30]:
import torch

from datasets import load_metric
from preprocessing import get_sst2_dataset, get_nli_dataset, get_ner_dataset, get_squad_dataset
from torch import optim
from transformers import AutoModelForQuestionAnswering, AutoModelForSequenceClassification, AutoModelForTokenClassification, AutoTokenizer, TrainingArguments, Trainer

In [6]:
seed = 42
torch.manual_seed(seed)

<torch._C.Generator at 0x7fa9151aab10>

In [7]:
if torch.cuda.is_available():
  print("Good to go!")
else:
  print("Please connect to a GPU to run this notebook.")

Please connect to a GPU to run this notebook.


In [48]:
def retrieve_model(fine_tunning_task, model_name="roberta-base"):
    if fine_tunning_task == "sa":
        tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
        dataset, encoded_dataset = get_sst2_dataset(tokenizer)
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
    elif fine_tunning_task == "nli":
        tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
        dataset, encoded_dataset = get_nli_dataset(tokenizer)
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)
    elif fine_tunning_task == "ner":
        tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)
        dataset, encoded_dataset = get_ner_dataset(tokenizer)
        label_list = dataset["train"].features["ner_tags"].feature.names
        model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(label_list))
    elif fine_tunning_task == "qa":
        tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
        dataset, encoded_dataset = get_squad_dataset(tokenizer)
        model = AutoModelForQuestionAnswering.from_pretrained(model_name)
    else:
        raise ValueError("Invalid task")

    return dataset, encoded_dataset, tokenizer, model

In [None]:
# load_metric("glue", "sst2")

In [49]:
model_name = "roberta-base"
task = "ner"
dataset, encoded_dataset, tokenizer, model = retrieve_model(task, model_name)

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [50]:
encoded_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3453
    })
})

In [51]:
training_args = TrainingArguments(
    output_dir="{}-finetuned-{}".format(model_name, task),
    learning_rate=2e-5,
    num_train_epochs=5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # weight_decay=0.01,
    evaluation_strategy = "epoch", # "no" to avoid evaluation
    save_strategy = "epoch", # "no" to avoid saving
    logging_steps=500,
    report_to="tensorboard",
    logging_dir="./tensorboard/{}-finetuned-{}".format(model_name, task),
)

# Huggingface optimizers: https://huggingface.co/docs/transformers/v4.39.3/en/main_classes/optimizer_schedules#optimization
# Huggingface Schedulers: https://huggingface.co/docs/transformers/v4.39.3/en/main_classes/optimizer_schedules#schedules
# E.g.
# from transformers import AdamW, get_linear_schedule_with_warmup
# optimizer = AdamW(model.parameters(), lr=training_args.learning_rate)
# num_warmup_steps = training_args.num_train_epochs * len(train_dataset) // training_args.per_device_train_batch_size  # Adjust warmup based on your training setup
# scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=training_args.num_train_epochs * len(train_dataset) // training_args.per_device_train_batch_size)


# PyTorch optimizers: https://pytorch.org/docs/stable/optim.html#algorithms
# PyTorch schedulers: https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
optimizer = optim.SGD(model.parameters(), lr=training_args.learning_rate)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)


trainer = Trainer(
    model,
    args=training_args,
    optimizers=(optimizer, scheduler), # Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    # compute_metrics=compute_metrics
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [52]:
trainer.train()
# trainer.save_model("models/{}-finetuned-{}".format(model_name, task))

  0%|          | 0/2195 [00:00<?, ?it/s]

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`labels` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [None]:
# eval_result = trainer.evaluate()