In [1]:
from IPython import display

In [52]:
# !pip install transformers datasets huggingface_hub tensorboard==2.11
# !pip install scikit-learn
display.clear_output()

In [3]:
import torch
from datasets import load_dataset
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig,
)
from huggingface_hub import HfFolder, notebook_login

In [4]:
model_id = "roberta-base"
repository_id = "achimoraites/roberta-base_ag_news"
dataset_path = "../../data/new_data.csv"

In [5]:
ds = load_dataset("csv", data_files=dataset_path)["train"]
ds = ds.rename_column("R2DiscussionType", "label")
ds.set_format("torch", columns=["Message", "label"])
ds = ds.train_test_split(test_size=0.2)
ds = ds.class_encode_column("label")
print(ds)

Flattening the indices:   0%|          | 0/488 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/488 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/123 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/123 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'Course', 'Book ID', 'Bookclub', 'Pseudonym', 'Message', 'Message Time', 'Page', 'label', 'R2DialogicSpell', 'R2Uptake', 'R2 Question', 'R2 Pivot'],
        num_rows: 488
    })
    test: Dataset({
        features: ['Unnamed: 0', 'Course', 'Book ID', 'Bookclub', 'Pseudonym', 'Message', 'Message Time', 'Page', 'label', 'R2DialogicSpell', 'R2Uptake', 'R2 Question', 'R2 Pivot'],
        num_rows: 123
    })
})


In [6]:
# Training and testing datasets
train_dataset = ds['train']
test_dataset = ds["test"]

In [7]:
tokenizer = RobertaTokenizerFast.from_pretrained(model_id)
def tokenize(batch):
    return tokenizer(batch["Message"], padding=True, truncation=True, max_length=256)


train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
test_dataset = test_dataset.map(tokenize, batched=True, batch_size=len(test_dataset))




Map:   0%|          | 0/488 [00:00<?, ? examples/s]

Map:   0%|          | 0/123 [00:00<?, ? examples/s]

In [8]:
# We will need this to directly output the class names when using the pipeline without mapping the labels later.
# Extract the number of classes and their names
num_labels = ds['train'].features['label'].num_classes
print(num_labels)
class_names = ds["train"].features["label"].names
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

# Create an id2label mapping
id2label = {i: label for i, label in enumerate(class_names)}
print(id2label)
# Update the model's configuration with the id2label mapping
config = AutoConfig.from_pretrained(model_id)
config.update({"id2label": id2label})

12
number of labels: 12
the labels: ['Deliberation', 'Deliberation, Seminar', 'Imaginative', 'Imaginative entry', 'Other', 'Procedure', 'Seminar', 'Seminar, Deliberation', 'Social', 'Social, Deliberation', 'Social, Procedure', 'UX']
{0: 'Deliberation', 1: 'Deliberation, Seminar', 2: 'Imaginative', 3: 'Imaginative entry', 4: 'Other', 5: 'Procedure', 6: 'Seminar', 7: 'Seminar, Deliberation', 8: 'Social', 9: 'Social, Deliberation', 10: 'Social, Procedure', 11: 'UX'}


In [56]:
from datasets import load_metric
import numpy as np
metric = load_metric('accuracy')

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [57]:
# Model
model = RobertaForSequenceClassification.from_pretrained(model_id, config=config)

# TrainingArguments
training_args = TrainingArguments(
    output_dir="train_out",
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy="epoch",
    logging_dir=f"train_out/logs",
    logging_strategy="steps",
    logging_steps=1,
    learning_rate=1e-4,
    weight_decay=0.01,
    warmup_steps=100,
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=2,
    report_to="tensorboard",
    # push_to_hub=True,
    # hub_strategy="every_save",
    # hub_model_id=repository_id,
    # hub_token=HfFolder.get_token(),
)

# Trainer
trainer = Trainer(
    model=model,
    compute_metrics=compute_metrics,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [58]:
# Fine-tune the model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,2.2779,2.583391,0.065041
2,1.9629,3.857165,0.065041
3,0.8381,4.522352,0.097561


TrainOutput(global_step=48, training_loss=1.79818710933129, metrics={'train_runtime': 39.0043, 'train_samples_per_second': 37.534, 'train_steps_per_second': 1.231, 'total_flos': 154994548913280.0, 'train_loss': 1.79818710933129, 'epoch': 3.0})

In [64]:
from transformers import pipeline

classifier = pipeline('text-classification', model, tokenizer=tokenizer, device="cuda")

text = [
  "I believe that the tiger never liked this girl in the first place",
  "My button didnt really work when trying",
  "Hello how are you all"
  ]
result = classifier(text)
result

# predicted_label = result[0]["label"]
# print(f"Predicted label: {predicted_label}")

[{'label': 'Seminar', 'score': 0.1144128367304802},
 {'label': 'Seminar', 'score': 0.11151367425918579},
 {'label': 'Seminar', 'score': 0.10919561237096786}]

## Performance evaluation

In [65]:
# Evaluate the model
trainer.evaluate()

{'eval_loss': 2.583390951156616,
 'eval_accuracy': 0.06504065040650407,
 'eval_runtime': 0.7789,
 'eval_samples_per_second': 157.919,
 'eval_steps_per_second': 5.136,
 'epoch': 3.0}