In [1]:
from datasets import load_dataset, load_metric, Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification,  TrainingArguments, Trainer  

import numpy as np

In [2]:
dataset = load_dataset('tweets_hate_speech_detection')
metric = load_metric('accuracy')

Using custom data configuration default
Reusing dataset tweets_hate_speech_detection (/home/stas/.cache/huggingface/datasets/tweets_hate_speech_detection/default/0.0.0/c32a982d8b2d6233065d820ac655454174f8aaa8faddc74979cf793486acd3b0)


In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'tweet'],
        num_rows: 31962
    })
})

In [4]:
dataset = dataset['train'].train_test_split(test_size=0.3)
dataset

Loading cached split indices for dataset at /home/stas/.cache/huggingface/datasets/tweets_hate_speech_detection/default/0.0.0/c32a982d8b2d6233065d820ac655454174f8aaa8faddc74979cf793486acd3b0/cache-249d93807d60b406.arrow and /home/stas/.cache/huggingface/datasets/tweets_hate_speech_detection/default/0.0.0/c32a982d8b2d6233065d820ac655454174f8aaa8faddc74979cf793486acd3b0/cache-38382b74cf69f649.arrow


DatasetDict({
    train: Dataset({
        features: ['label', 'tweet'],
        num_rows: 22373
    })
    test: Dataset({
        features: ['label', 'tweet'],
        num_rows: 9589
    })
})

In [5]:
batch_size = 3

In [6]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-base", use_fast=True)

In [7]:
def preprocess_function(examples):
    return tokenizer(examples['tweet'])

In [8]:
encoded_dataset = dataset.map(preprocess_function, batched=True)

Loading cached processed dataset at /home/stas/.cache/huggingface/datasets/tweets_hate_speech_detection/default/0.0.0/c32a982d8b2d6233065d820ac655454174f8aaa8faddc74979cf793486acd3b0/cache-78d2a739939d6604.arrow
Loading cached processed dataset at /home/stas/.cache/huggingface/datasets/tweets_hate_speech_detection/default/0.0.0/c32a982d8b2d6233065d820ac655454174f8aaa8faddc74979cf793486acd3b0/cache-59c0b9951d04f7d7.arrow


In [9]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

In [10]:
model = RobertaForSequenceClassification.from_pretrained("roberta-base")

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

In [11]:
args = TrainingArguments(
    "test",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
)

In [12]:
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset['test'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [13]:
trainer.evaluate(encoded_dataset['test'])

{'eval_loss': 0.7709921598434448,
 'eval_accuracy': 0.06882886640942747,
 'eval_runtime': 51.8373,
 'eval_samples_per_second': 184.983,
 'init_mem_cpu_alloc_delta': 1988603904,
 'init_mem_gpu_alloc_delta': 499887104,
 'init_mem_cpu_peaked_delta': 153870336,
 'init_mem_gpu_peaked_delta': 0,
 'eval_mem_cpu_alloc_delta': 16838656,
 'eval_mem_gpu_alloc_delta': 0,
 'eval_mem_cpu_peaked_delta': 0,
 'eval_mem_gpu_peaked_delta': 34740736}

In [14]:
trainer.train();

Epoch,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
1,0.2014,0.237843,0.953593,56.6694,169.209
2,0.3371,0.345504,0.931171,51.7213,185.398
3,0.358,0.358331,0.931171,50.4925,189.909
4,0.3208,0.330226,0.931171,52.1937,183.719
5,0.3565,0.331556,0.931171,51.0275,187.918


In [15]:
trainer.evaluate(encoded_dataset['test'])

{'eval_loss': 0.2378431260585785,
 'eval_accuracy': 0.9535926582542497,
 'eval_runtime': 50.6061,
 'eval_samples_per_second': 189.483,
 'epoch': 5.0,
 'eval_mem_cpu_alloc_delta': 81920,
 'eval_mem_gpu_alloc_delta': 0,
 'eval_mem_cpu_peaked_delta': 0,
 'eval_mem_gpu_peaked_delta': 33926656}

Доля правильных ответ на валидационном множестве до обучения: 0.06882886640942747
Доля правильных ответ на валидационном множестве до обучения: 0.9535926582542497

In [16]:
from sklearn.metrics import auc
from sklearn.metrics import roc_curve
import torch
import matplotlib.pyplot as plt

In [17]:
def roc_curve_plot(fpr,tpr,roc_auc):
    plt.figure()
    lw = 2
    plt.plot(fpr, tpr, color='darkorange',
             lw=lw, label='ROC curve (area = %0.2f)' %roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC')
    plt.legend(loc="lower right")
    plt.show()

In [18]:
inputs = tokenizer(dataset['train']['tweet'], truncation=True, padding=True, max_length=30,return_tensors="pt")['input_ids'].cuda()
outputs = model(inputs)

y_valid = dataset['train']['label']

RuntimeError: CUDA out of memory. Tried to allocate 1.92 GiB (GPU 0; 5.80 GiB total capacity; 2.35 GiB already allocated; 1.72 GiB free; 2.56 GiB reserved in total by PyTorch)

In [None]:
with torch.no_grad():
    y_predict_prob = np.argmax(outputs.logits.cpu(), axis=1)
    fpr, tpr, _ = roc_curve(y_valid,y_predict_prob)
    roc_auc = auc(fpr, tpr)
    roc_curve_plot(fpr,tpr,roc_auc)

К сожалению, не хватило памяти