In [2]:
from importlib.resources import files

import polars as pl
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
import torch

from aml_wa24 import models

In [None]:
# SST-2 is a text classification task, more specifically sentiment analysis
# (label 0 = negative sentiment, label 1 = positive sentiment).
# Load the training split and convert it to a polars DataFrame.
sst2 = (
    load_dataset("sst2", split="train")
    .to_polars()
)
sst2

In [4]:
eval_size = 1000
train_size = 1000
# The evaluation rows are taken from the head and the training rows from the
# tail of the shuffled frame, so the two sets are disjoint only if together
# they do not exceed the number of available rows.
assert eval_size + train_size <= len(sst2), (
    f"eval_size + train_size ({eval_size + train_size}) exceeds the "
    f"{len(sst2)} available rows; train and eval data would overlap"
)

# Bind the shuffled frame to a new name instead of overwriting `sst2`.
# Re-running this cell then always shuffles the frame as it was loaded, so the
# seeded split stays identical across re-runs.
sst2_shuffled = sst2.sample(fraction=1, shuffle=True, seed=42)
eval_df = sst2_shuffled.head(eval_size)  # first `eval_size` shuffled rows
train_df = sst2_shuffled.tail(train_size)  # last `train_size` shuffled rows

In [None]:
# Load the sentence encoder from the model directory that lives inside the
# `aml_wa24.models` package.
encoder = SentenceTransformer(
    str(files(models).joinpath("paraphrase-multilingual-MiniLM-L12-v2"))
)

# The encoder takes some text input and returns a tensor with 384 dimensions.
# We will use these 384 dimensions as the input features of a neural network
# in order to train a text classifier.
# Don't worry about how the encoder works for now; we will talk about that on day 3.
encoding = encoder.encode("hello world", convert_to_tensor=True)
print(encoding.shape)
encoding

In [7]:
def to_torch_dataset(df: pl.DataFrame) -> torch.utils.data.TensorDataset:
    """Turn a DataFrame of labelled sentences into a ``TensorDataset``.

    Args:
        df: Frame with a ``sentence`` column (the texts) and a ``label``
            column (the targets).

    Returns:
        A ``TensorDataset`` whose items are ``(features, label)`` pairs, where
        ``features`` is the encoder output for the sentence.

    Note:
        Relies on the notebook-level ``encoder``; the cell that creates it
        must have been run first.
    """
    labels = df["label"].to_torch()  # convert labels to a tensor
    texts = df["sentence"].to_list()  # get texts as a list of strings
    # Convert the texts into one feature tensor with the encoder.
    features = encoder.encode(texts, convert_to_tensor=True)
    # The encoder may return its output on its own device (e.g. a GPU), while
    # the labels coming from polars start out on the CPU. Keep both tensors on
    # the same device so every batch is device-consistent.
    labels = labels.to(features.device)
    return torch.utils.data.TensorDataset(features, labels)

In [8]:
batch_size = 4

# Training data: encoded once up front, then served in shuffled mini-batches.
train_dataset = to_torch_dataset(df=train_df)
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    # Seed the shuffling so the batch order is reproducible on a fresh run,
    # in line with the seeded train/eval split.
    generator=torch.Generator().manual_seed(42),
)

# Evaluation data: order does not matter, so no shuffling is needed.
eval_dataset = to_torch_dataset(df=eval_df)
eval_loader = torch.utils.data.DataLoader(dataset=eval_dataset, batch_size=batch_size)

In [9]:
# Task 1: Train a basic Text classifier based on what you learned in the previous notebook
# Step 1: Define a model, a loss, and an optimizer.
# Step 2: Write a training loop with evaluation to train the model.
# Tip: You can copy the code from the torch_training notebook.

In [None]:
# Task 2:
# Write a function that takes a string as input and returns the prediction (positive/negative sentiment).

In [None]:
# Optional Task:
# Use a different model from the downloaded models.

In [None]:
# Optional Task:
# Find some different text classification datasets online and try them out.

In [None]:
# Tip: The Tasks from the previous notebook pretty much also apply to this one.