In [1]:
from importlib.resources import files

import polars as pl
import torch
from torch import nn, optim
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset
from torcheval.metrics.functional import multiclass_accuracy

from aml_wa24 import models

In [2]:
# Split sizes: the first `eval_size` shuffled rows become the evaluation set,
# the last `train_size` shuffled rows become the training set.
eval_size = 100
train_size = 1000

# Load the SST-2 "train" split and convert it to a polars DataFrame.
sst2 = load_dataset("sst2", split="train").to_polars()

# head() and tail() below can only be disjoint if the frame holds at least
# eval_size + train_size rows, i.e. no evaluation row leaks into training.
assert len(sst2) >= eval_size + train_size

# Shuffle all rows with a fixed seed so the split is the same on every run.
sst2 = sst2.sample(fraction=1, shuffle=True, seed=42)
eval_df, train_df = sst2.head(eval_size), sst2.tail(train_size)

In [None]:
# Path of the "paraphrase-MiniLM-L3-v2" resource inside the `aml_wa24.models` package.
model_path = str(files(models) / "paraphrase-MiniLM-L3-v2")

# Model (with a sequence-classification head) and its tokenizer are both
# loaded from that same path.
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [4]:
# Hyperparameters: rows per mini-batch (read by in_batches) and SGD step size.
batch_size = 4
learning_rate = 1e-3

# Plain SGD over all model parameters, optimised against cross-entropy on the logits.
optimizer = optim.SGD(model.parameters(), lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

In [5]:
# Optional Task: Understand what is going on here.
def in_batches(
    df: pl.DataFrame,
    shuffle: bool = False,
    size: "int | None" = None,
    seed: "int | None" = None,
):
    """Split ``df`` into consecutive mini-batches of ``(sentences, labels)``.

    Args:
        df: Frame with a ``sentence`` column and a ``label`` column.
        shuffle: If true, the rows are randomly permuted before batching.
        size: Number of rows per batch. ``None`` (the default) uses the
            notebook-level ``batch_size``, looked up at call time.
        seed: Seed for the shuffle. ``None`` (the default) leaves the
            shuffle unseeded, so every call yields a different order.

    Returns:
        A list of ``(sentences, labels)`` tuples in row order, where
        ``sentences`` is a list taken from the ``sentence`` column and
        ``labels`` is the ``label`` column converted to a torch tensor.
        The last batch is smaller when the row count is not a multiple of
        the batch size.

    Raises:
        ValueError: If the effective batch size is not positive.
    """
    rows_per_batch = batch_size if size is None else size
    if rows_per_batch <= 0:
        raise ValueError(f"batch size must be positive, got {rows_per_batch}")
    if shuffle:
        df = df.sample(fraction=1, shuffle=True, seed=seed)
    # iter_slices yields consecutive row chunks of at most `rows_per_batch`
    # rows, replacing the manual row-index / floor / partition_by bucketing.
    return [
        (chunk["sentence"].to_list(), chunk["label"].to_torch())
        for chunk in df.iter_slices(n_rows=rows_per_batch)
    ]

In [6]:
def evaluate(df: pl.DataFrame):
    """Compute the loss and accuracy of the notebook-level ``model`` on ``df``.

    The frame is processed in batches (via ``in_batches``, unshuffled), the
    logits of all batches are concatenated, and loss and accuracy are computed
    once over the full set of predictions.

    Args:
        df: Frame with a ``sentence`` and a ``label`` column.

    Returns:
        A dict ``{"loss": float, "acc": float}`` holding the ``loss_fn`` value
        and the ``multiclass_accuracy`` over all rows of ``df``.

    Note:
        The model is switched to eval mode and left there; callers that
        continue training must call ``model.train()`` again.
    """
    # Inputs and labels must live on the same device as the model, otherwise
    # the forward pass / loss fails once the model has been moved to a GPU.
    device = model.device
    with torch.no_grad():
        model.eval()
        preds = []
        labels = []
        for texts, label in in_batches(df):
            labels.append(label.to(device))
            features = tokenizer(texts, return_tensors="pt", padding=True).to(device)
            preds.append(model(**features)["logits"])
        preds = torch.cat(preds)
        labels = torch.cat(labels)
        loss = loss_fn(preds, labels).item()
        accuracy = multiclass_accuracy(preds, labels).item()
        return {"loss": loss, "acc": accuracy}

In [None]:
# Number of full passes over train_df.
epochs = 10

for epoch in range(epochs):
    # evaluate() leaves the model in eval mode, so training mode has to be
    # re-enabled at the start of every epoch.
    model.train()
    for texts, labels in in_batches(train_df, shuffle=True):
        optimizer.zero_grad() # zero gradients
        # Move inputs and labels to the model's device so the loop also works
        # after the model has been moved to a GPU (no-op on CPU).
        features = tokenizer(texts, return_tensors="pt", padding=True).to(model.device)
        preds = model(**features)["logits"]
        loss = loss_fn(preds, labels.to(model.device))
        loss.backward()
        optimizer.step()
    # Report loss and accuracy on the held-out evaluation set after each epoch.
    print(evaluate(eval_df))

In [8]:
# Task 1
# Write a function that takes a string and predicts its sentiment with the model.

In [None]:
# Optional Task (torch)
# Try to run this notebook on Colab.
# Try to use a GPU.

# Tip: Colab only uses the CPU by default, so you have to activate a GPU first.
# Tip: You can check which device the model is running on with
print(model.device) # "cpu" = CPU; "cuda" = GPU
# Tip: You can move the model to another device with
model = model.to("cpu") # use "cuda" for the GPU
# Tip: torch tensors (and other torch components) work exactly the same way: they also have `.to(...)`.

In [10]:
# Optional Task
# Try out a larger model, for example "google-bert/bert-base-uncased".
# You can find other models at https://huggingface.co/models

In [11]:
# Optional Task
# Which is better: basic_text_classififcation.ipynb or this notebook? Why? What are the differences?