In [1]:
import os
import re
import string

import datasets
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import transformers
import wandb

from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score
from tqdm.notebook import tqdm, trange



In [2]:
%env WANDB_PROJECT=bertweet-finetuning
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


env: WANDB_PROJECT=bertweet-finetuning


wandb: Currently logged in as: simon-andrews (umass-iesl-is). Use `wandb login --relogin` to force relogin


True

In [3]:
def tokenize(example):
    tokenizer = transformers.BertweetTokenizer.from_pretrained(
        "vinai/bertweet-base",
        do_lower_case=True
    )

    encoded_dict = tokenizer(
        example["text"],
        add_special_tokens=True, # add [CLS] and [SEP]
        max_length=64, # appropriate for tweets
        padding="max_length",
        truncation=True, # truncate large inputs
        return_attention_mask=True, # construct attention masks
    )

    return {
        "input_ids": encoded_dict["input_ids"],
        "attention_mask": encoded_dict["attention_mask"]
    }

In [4]:
# 0 --> not hate speech
# 1 --> offensive but not hateful
# 2 --> hate speech

hsol_data = datasets.load_dataset("hate_speech_offensive", split="train") \
    .rename_column("tweet", "text") \
    .rename_column("class", "label") \
    .remove_columns(["count", "hate_speech_count", "offensive_language_count", "neither_count"]) \
    .map(lambda ex: {"label": 2 if ex["label"] == 0 else ex["label"]}) \
    .train_test_split(test_size=0.2, seed=685) \
    .map(tokenize, batched=True)

hsol_data

Using custom data configuration default
Reusing dataset hate_speech_offensive (/home/ubuntu/.cache/huggingface/datasets/hate_speech_offensive/default/1.0.0/5f5dfc7b42b5c650fe30a8c49df90b7dbb9c7a4b3fe43ae2e66fabfea35113f5)
Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/hate_speech_offensive/default/1.0.0/5f5dfc7b42b5c650fe30a8c49df90b7dbb9c7a4b3fe43ae2e66fabfea35113f5/cache-3e2f37a10615e1c9.arrow
Loading cached split indices for dataset at /home/ubuntu/.cache/huggingface/datasets/hate_speech_offensive/default/1.0.0/5f5dfc7b42b5c650fe30a8c49df90b7dbb9c7a4b3fe43ae2e66fabfea35113f5/cache-ed6d7da02ee06499.arrow and /home/ubuntu/.cache/huggingface/datasets/hate_speech_offensive/default/1.0.0/5f5dfc7b42b5c650fe30a8c49df90b7dbb9c7a4b3fe43ae2e66fabfea35113f5/cache-95fa89553329019c.arrow
Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/hate_speech_offensive/default/1.0.0/5f5dfc7b42b5c650fe30a8c49df90b7dbb9c7a4b3fe43ae2e66fabfea35113f5/cac

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 19826
    })
    test: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 4957
    })
})

In [5]:
device = torch.device('cuda' if torch.has_cuda else 'cpu'); device

device(type='cuda')

In [6]:
def binarize_hsol_outputs(outputs):
    ret = outputs.copy()
    ret[ret == 1] = 0
    ret[ret == 2] = 1
    return ret

binarize_hsol_outputs(np.array([0, 1, 2]))

array([0, 0, 1])

In [7]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    binary_labels = binarize_hsol_outputs(labels)
    binary_predictions = binarize_hsol_outputs(predictions)

    cfm = confusion_matrix(binary_labels, binary_predictions)
    true_negatives = cfm[0][0]
    false_negatives = cfm[1][0]
    true_positives = cfm[1][1]
    false_positives = cfm[0][1]
    
    return {
        "Multiclass accuracy": accuracy_score(labels, predictions),
        "Binary accuracy": accuracy_score(binary_labels, binary_predictions),
        "F1 score": f1_score(labels, predictions, average="micro"),
        "Weighted F1 score": f1_score(labels, predictions, average="weighted"),
        "True positives": true_positives,
        "False positives": false_positives,
        "True negatives": true_negatives,
        "False negatives": false_negatives,
        "Precision": precision_score(binary_labels, binary_predictions),
        "Recall": recall_score(binary_labels, binary_predictions),
    }

compute_metrics((
    np.random.normal(size=(6, 3)),
    np.array([2, 1, 0, 2, 1, 0])
))

{'Multiclass accuracy': 0.16666666666666666,
 'Binary accuracy': 0.3333333333333333,
 'F1 score': 0.16666666666666666,
 'Weighted F1 score': 0.16666666666666666,
 'True positives': 0,
 'False positives': 2,
 'True negatives': 2,
 'False negatives': 2,
 'Precision': 0.0,
 'Recall': 0.0}

In [9]:
for learning_rate in [4e-5]: # [1e-5, 2e-5, 3e-5, 4e-5]:
    with wandb.init() as run:
        model = transformers.AutoModelForSequenceClassification.from_pretrained("vinai/bertweet-base", num_labels=3).to(device)
        name = f"bertweet-then-hsol-with-lr-{learning_rate}"

        training_args = transformers.TrainingArguments(
            output_dir=name,
            report_to="wandb",
            run_name=name,
            num_train_epochs=3,
            learning_rate=learning_rate,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            per_device_train_batch_size=64,
            logging_steps=20,
        )

        trainer = transformers.Trainer(
            model=model,
            args=training_args,
            train_dataset=hsol_data["train"],
            eval_dataset=hsol_data["test"],
            compute_metrics=compute_metrics,
        )

        trainer.train()

VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/Binary accuracy,▁▆▄▂▄▇▃▅█
eval/F1 score,▁▆▄▂▄▇▃▅█
eval/False negatives,█▄▇▂▁▅▆▁▄
eval/False positives,▁▄▁██▂▃▇▃
eval/Multiclass accuracy,▁▆▄▂▄▇▃▅█
eval/Precision,▇▅█▁▁▇▆▂▆
eval/Recall,▁▅▂▇█▄▃█▅
eval/True negatives,█▅█▁▁▇▆▂▆
eval/True positives,▁▅▂▇█▄▃█▅
eval/Weighted F1 score,▁▇▄▄▅▇▄▇█

0,1
eval/Binary accuracy,0.93444
eval/F1 score,0.93444
eval/False negatives,211.0
eval/False positives,114.0
eval/Multiclass accuracy,0.93444
eval/Precision,0.88878
eval/Recall,0.81194
eval/True negatives,3721.0
eval/True positives,911.0
eval/Weighted F1 score,0.93336


loading configuration file https://huggingface.co/vinai/bertweet-base/resolve/main/config.json from cache at /home/ubuntu/.cache/huggingface/transformers/356366feedcea0917e30f7f235e1e062ffc2d28138445d5672a184be756c8686.a2b6026e688d1b19cebc0981d8f3a5b1668eabfda55b2c42049d5eac0bc8cb2d
Model config RobertaConfig {
  "_name_or_path": "vinai/bertweet-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 130,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id"

Epoch,Training Loss,Validation Loss,Multiclass accuracy,Binary accuracy,F1 score,Weighted f1 score,True positives,False positives,True negatives,False negatives,Precision,Recall
1,0.2166,0.177974,0.933831,0.933831,0.933831,0.932593,903,109,3726,219,0.892292,0.804813
2,0.1712,0.175823,0.937664,0.937664,0.937664,0.937039,937,124,3711,185,0.883129,0.835116
3,0.1547,0.181829,0.934638,0.934638,0.934638,0.933581,912,114,3721,210,0.888889,0.812834


The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 4957
  Batch size = 8
Saving model checkpoint to bertweet-then-hsol-with-lr-4e-05/checkpoint-310
Configuration saved in bertweet-then-hsol-with-lr-4e-05/checkpoint-310/config.json
Model weights saved in bertweet-then-hsol-with-lr-4e-05/checkpoint-310/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 4957
  Batch size = 8
Saving model checkpoint to bertweet-then-hsol-with-lr-4e-05/checkpoint-620
Configur

VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/Binary accuracy,▁█▂
eval/F1 score,▁█▂
eval/False negatives,█▁▆
eval/False positives,▁█▃
eval/Multiclass accuracy,▁█▂
eval/Precision,█▁▅
eval/Recall,▁█▃
eval/True negatives,█▁▆
eval/True positives,▁█▃
eval/Weighted F1 score,▁█▃

0,1
eval/Binary accuracy,0.93464
eval/F1 score,0.93464
eval/False negatives,210.0
eval/False positives,114.0
eval/Multiclass accuracy,0.93464
eval/Precision,0.88889
eval/Recall,0.81283
eval/True negatives,3721.0
eval/True positives,912.0
eval/Weighted F1 score,0.93358


In [12]:
model = transformers.AutoModelForPreTraining.from_pretrained("./bertweet-then-hsol-with-lr-4e-05/checkpoint-620")

model.save_pretrained("hsol-final")

loading configuration file ./bertweet-then-hsol-with-lr-4e-05/checkpoint-620/config.json
Model config RobertaConfig {
  "_name_or_path": "./bertweet-then-hsol-with-lr-4e-05/checkpoint-620",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 130,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "tokenizer_class": "BertweetTokenizer",
  "torch_dtype