In [1]:
from importlib.resources import files

import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import TrainingArguments, Trainer
from datasets import load_dataset

from aml_wa24 import models

In [None]:
# Resolve the "paraphrase-MiniLM-L3-v2" resource inside the aml_wa24.models
# package and turn it into a plain string path for from_pretrained().
model_path = str(files(models) / "paraphrase-MiniLM-L3-v2")

# Tokenizer and classifier are both loaded from that same local path.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=2,  # two output classes
)

In [3]:
# Load only the "train" split of the "sst2" dataset.
dataset = load_dataset(path="sst2", split="train")

In [4]:
def tokenize_function(examples):
    """Tokenize the "sentence" field of a batch of examples.

    Args:
        examples: Mapping with a "sentence" entry holding the text(s) to
            tokenize (a batch when used with ``map(..., batched=True)``).

    Returns:
        Whatever the notebook-level ``tokenizer`` returns for those texts,
        called with truncation enabled and padding set to "max_length".
    """
    sentences = examples["sentence"]
    encoded = tokenizer(sentences, truncation=True, padding="max_length")
    return encoded

# Apply tokenize_function over the dataset in batches.
tokenized_datasets = dataset.map(function=tokenize_function, batched=True)

In [5]:
# Shuffle once and carve out two non-overlapping index ranges. Both subsets
# come from the same split, so shuffling it twice with different seeds (as
# before) could put the same row in both subsets and leak training data into
# the evaluation metric. The training subset is unchanged: still the first
# 100 rows of the seed=42 shuffle.
shuffled_datasets = tokenized_datasets.shuffle(seed=42)
small_train_dataset = shuffled_datasets.select(range(100))
small_eval_dataset = shuffled_datasets.select(range(100, 200))

In [6]:
def compute_metrics(eval_pred):
    """Compute accuracy from raw model scores and reference labels.

    Args:
        eval_pred: A pair that unpacks into (scores, labels). The predicted
            class of each example is the argmax over the last axis of the
            scores.

    Returns:
        dict: ``{"acc": value}``, where value is the mean of the elementwise
        comparison between predicted classes and labels.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    is_correct = predicted_classes == references
    return {"acc": np.mean(is_correct)}

# Training configuration: evaluate at the end of every epoch, never save
# checkpoints, and use "test_trainer" as the output directory.
training_args = TrainingArguments(
    eval_strategy="epoch",
    save_strategy="no",
    output_dir="test_trainer",
)

# Wire the model, the two small dataset subsets and the accuracy metric
# into a single Trainer instance.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
)

In [None]:
# Run training with the Trainer built above. As the last expression in the
# cell, the value returned by train() is shown as the cell output.
trainer.train()

In [None]:
# Task 1
# Write a function that takes a text and returns the model's prediction for it

In [None]:
# Look into the parameters id2label and label2id for the from_pretrained method

In [None]:
# Try out a different dataset

In [8]:
# Run the code on Google Colab, on a GPU

In [None]:
# Try out a larger model like "google-bert/bert-base-uncased"