In [1]:
import os
import wget

from datasets import load_dataset, load_metric, Dataset, DatasetDict
import pandas as pd
from transformers import XLMRobertaForSequenceClassification, XLMRobertaTokenizerFast, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [2]:
liar_url = "https://www.cs.ucsb.edu/~william/data/liar_dataset.zip"
if not os.path.exists("./data/"):
  os.mkdir("./data/")
if not os.path.exists("./data/liar_dataset.zip"):
  wget.download(liar_url, "./data/liar_dataset.zip")
if not os.path.exists("./data/train.tsv"):
  !unzip ./data/liar_dataset.zip -d ./data/

In [3]:
train_df = pd.read_csv("./data/train.tsv", sep="\t", usecols=[1,2], names=["label", "text"])
valid_df = pd.read_csv("./data/valid.tsv", sep="\t", usecols=[1,2], names=["label", "text"])

label_dict = {
    "true":0,
    "mostly-true":0,
    "half-true":0,
    "barely-true":1,
    "false":1,
    "pants-fire":1
    }

train_df["label"] = [label_dict[lab] for lab in train_df.label]
valid_df["label"] = [label_dict[lab] for lab in valid_df.label]

In [4]:
tokenizer = XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base", padding=True, truncation=True)

def encode(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=512)

In [5]:
liar_train = Dataset.from_pandas(train_df)
liar_valid = Dataset.from_pandas(valid_df)
liar_train = liar_train.map(encode, batched=True)
liar_valid = liar_valid.map(encode, batched=True)
data_dict = DatasetDict({"train":liar_train, "valid":liar_valid})


Map:   0%|          | 0/10240 [00:00<?, ? examples/s]

Map:   0%|          | 0/1284 [00:00<?, ? examples/s]

In [6]:
roberta_clf = XLMRobertaForSequenceClassification.from_pretrained("xlm-roberta-base", num_labels=2)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
train_args = TrainingArguments(
    evaluation_strategy="epoch",
    save_strategy="no",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    output_dir="./outputs/"
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [8]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    # Calculate accuracy
    accuracy = accuracy_score(labels, preds)

   # Calculate precision, recall, and F1-score
    precision = precision_score(labels, preds, average='weighted', zero_division=1)
    recall = recall_score(labels, preds, average='weighted')
    f1 = f1_score(labels, preds, average='weighted')

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [9]:
trainer = Trainer(
    roberta_clf,
    train_args,
    train_dataset=data_dict["train"],
    eval_dataset=data_dict["valid"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [10]:
trainer.train()

  0%|          | 0/3200 [00:00<?, ?it/s]

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


OutOfMemoryError: CUDA out of memory. Tried to allocate 192.00 MiB. GPU 0 has a total capacity of 3.81 GiB of which 112.06 MiB is free. Including non-PyTorch memory, this process has 3.70 GiB memory in use. Of the allocated memory 3.02 GiB is allocated by PyTorch, and 40.61 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)