In [1]:
import os
import re
import string

import datasets
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import transformers
import wandb

from sklearn.metrics import confusion_matrix, f1_score
from tqdm.notebook import tqdm, trange



In [2]:
%env WANDB_PROJECT=hsol-finetuning
wandb.login()

env: WANDB_PROJECT=hsol-finetuning


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
wandb: Currently logged in as: simon-andrews (umass-iesl-is). Use `wandb login --relogin` to force relogin


True

In [3]:
device = torch.device('cuda' if torch.has_cuda else 'cpu'); device

device(type='cuda')

In [4]:
torch.has_cuda

True

In [5]:
accuracy = datasets.load_metric("accuracy")

def compute_accuracy(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return accuracy.compute(predictions=predictions, references=labels)

In [6]:
f1 = datasets.load_metric("f1")

def compute_f1(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {
        "f1_micro": f1_score(labels, predictions, average="micro"),
        "f1_weighted": f1_score(labels, predictions, average="weighted"),
    }

In [7]:
def compute_confusion_matrix(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    labels[labels == 2] = 1
    predictions[predictions == 2] = 1
    cfm = confusion_matrix(labels, predictions)
    true_negatives = cfm[0][0]
    false_negatives = cfm[1][0]
    true_positives = cfm[1][1]
    false_positives = cfm[0][1]
    return {
        "true_negatives": true_negatives,
        "false_negatives": false_negatives,
        "true_positives": true_positives,
        "false_positives": false_positives
    }

In [8]:
def compute_all_metrics(eval_pred):
    return {
        **compute_accuracy(eval_pred),
        **compute_f1(eval_pred),
        # **compute_confusion_matrix(eval_pred)
    }

In [9]:
def upsample_classes(dataset):
    unique_labels = set(dataset["label"])
    label_counts = np.ndarray(shape=len(unique_labels), dtype=int)
    for label in unique_labels:
        label_count = np.count_nonzero(np.array(dataset["label"]) == label)
        label_counts[label] = label_count

    to_insert = dict()
    for label in unique_labels:
        to_insert[label] = np.max(label_counts) - label_counts[label]

    for label, count in to_insert.items():
        matches = dataset.filter(lambda ex: ex['label'] == label)
        np.random.seed(685)
        idxs = np.random.choice(np.arange(len(matches)), size=count)
        dataset = datasets.concatenate_datasets([dataset, matches.select(idxs)])

    return dataset

In [10]:
def preprocess_tweet(text):
    text = text.strip()
    text = re.sub('&#\d+;', '', text) # remove emojis
    text = re.sub('RT @.+: ', '', text) # remove retweets
    text = re.split(' |\n', text) # split on whitespace
    text = list(map(lambda word: '<username>' if re.match('@.+', word) else word, text))
    text = list(map(lambda word: '<url>' if re.match('https?://.+', word) else word, text))
    text = list(map(lambda word: '<number>' if re.match('\d+', word) else word, text))
    text = list(map(lambda word: '' if re.match('rt', word) else word, text))
    text = ' '.join(text)
    text = text.lower() # to lowercase
    text = text.translate(text.maketrans('', '', string.punctuation.replace('<', '').replace('>', '')))
    text = text.strip()
    return text

In [11]:
tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-cased")

In [12]:
# 0 --> not hate speech
# 1 --> hate speech

hsol_data_binary = datasets.load_dataset("hate_speech_offensive", split="train") \
    .rename_column("tweet", "text") \
    .rename_column("class", "label") \
    .remove_columns(["count", "hate_speech_count", "offensive_language_count", "neither_count"]) \
    .map(lambda ex: {"text": preprocess_tweet(ex["text"])}) \
    .map(lambda ex: tokenizer(ex["text"], padding="max_length", truncation=True, max_length=64), batched=True) \
    .map(lambda ex: {"label": 1 if ex["label"] == 0 else 0}) \
    .train_test_split(test_size=0.2, seed=685)

hsol_data_binary["train"] = upsample_classes(hsol_data_binary["train"])
hsol_data_binary = hsol_data_binary.shuffle(seed=685)
hsol_data_binary

Using custom data configuration default
Reusing dataset hate_speech_offensive (/home/ubuntu/.cache/huggingface/datasets/hate_speech_offensive/default/1.0.0/5f5dfc7b42b5c650fe30a8c49df90b7dbb9c7a4b3fe43ae2e66fabfea35113f5)
Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/hate_speech_offensive/default/1.0.0/5f5dfc7b42b5c650fe30a8c49df90b7dbb9c7a4b3fe43ae2e66fabfea35113f5/cache-d52ad299245a54d3.arrow
Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/hate_speech_offensive/default/1.0.0/5f5dfc7b42b5c650fe30a8c49df90b7dbb9c7a4b3fe43ae2e66fabfea35113f5/cache-f54c318528565eb0.arrow
Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/hate_speech_offensive/default/1.0.0/5f5dfc7b42b5c650fe30a8c49df90b7dbb9c7a4b3fe43ae2e66fabfea35113f5/cache-4ef1853e16f88b2d.arrow
Loading cached split indices for dataset at /home/ubuntu/.cache/huggingface/datasets/hate_speech_offensive/default/1.0.0/5f5dfc7b42b5c650fe30a8c49df90b7dbb9c

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 37312
    })
    test: Dataset({
        features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4957
    })
})

In [13]:
# 0 --> not hate speech
# 1 --> offensive but not hateful
# 2 --> hate speech

hsol_data_multiclass = datasets.load_dataset("hate_speech_offensive", split="train") \
    .rename_column("tweet", "text") \
    .rename_column("class", "label") \
    .remove_columns(["count", "hate_speech_count", "offensive_language_count", "neither_count"]) \
    .map(lambda ex: {"text": preprocess_tweet(ex["text"])}) \
    .map(lambda ex: tokenizer(ex["text"], padding="max_length", truncation=True, max_length=64), batched=True) \
    .map(lambda ex: {"label": 2 if ex["label"] == 0 else 1 if ex["label"] == 1 else 0}) \
    .train_test_split(test_size=0.2, seed=685)

# hsol_data_multiclass["train"] = upsample_classes(hsol_data_multiclass["train"])
# hsol_data_multiclass = hsol_data_multiclass.shuffle(seed=685)
hsol_data_multiclass

Using custom data configuration default
Reusing dataset hate_speech_offensive (/home/ubuntu/.cache/huggingface/datasets/hate_speech_offensive/default/1.0.0/5f5dfc7b42b5c650fe30a8c49df90b7dbb9c7a4b3fe43ae2e66fabfea35113f5)
Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/hate_speech_offensive/default/1.0.0/5f5dfc7b42b5c650fe30a8c49df90b7dbb9c7a4b3fe43ae2e66fabfea35113f5/cache-d52ad299245a54d3.arrow
Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/hate_speech_offensive/default/1.0.0/5f5dfc7b42b5c650fe30a8c49df90b7dbb9c7a4b3fe43ae2e66fabfea35113f5/cache-f54c318528565eb0.arrow
Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/hate_speech_offensive/default/1.0.0/5f5dfc7b42b5c650fe30a8c49df90b7dbb9c7a4b3fe43ae2e66fabfea35113f5/cache-bf7e2c55f7d67e09.arrow
Loading cached split indices for dataset at /home/ubuntu/.cache/huggingface/datasets/hate_speech_offensive/default/1.0.0/5f5dfc7b42b5c650fe30a8c49df90b7dbb9c

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 19826
    })
    test: Dataset({
        features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4957
    })
})

In [12]:
# 0 --> not hate speech
# 1 --> hate speech

implicit_data_binary = pd.read_table('implicit_hate_v1_stg1_posts.tsv') \
    .rename(columns={"post": "text", "class": "label"}) \
    .replace({"not_hate": 0, "implicit_hate": 1, "explicit_hate": 1})

implicit_data_binary = datasets.Dataset.from_pandas(implicit_data_binary) \
    .map(lambda ex: {"text": preprocess_tweet(ex["text"])}) \
    .map(lambda ex: tokenizer(ex["text"], padding="max_length", truncation=True), batched=True) \
    .remove_columns("text") \
    .train_test_split(test_size=0.2, seed=685)

implicit_data_binary

  0%|          | 0/21480 [00:00<?, ?ex/s]

  0%|          | 0/22 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 17184
    })
    test: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4296
    })
})

In [13]:
# 0 --> not hate speech
# 1 --> implicit hate speech
# 2 --> explicit hate speech

implicit_data_multiclass = pd.read_table('implicit_hate_v1_stg1_posts.tsv') \
    .rename(columns={"post": "text", "class": "label"}) \
    .replace({"not_hate": 0, "implicit_hate": 1, "explicit_hate": 2})

implicit_data_multiclass = datasets.Dataset.from_pandas(implicit_data_multiclass) \
    .map(lambda ex: {"text": preprocess_tweet(ex["text"])}) \
    .map(lambda ex: tokenizer(ex["text"], padding="max_length", truncation=True, max_length=64), batched=True) \
    .remove_columns("text") \
    .train_test_split(test_size=0.2, seed=685)

implicit_data_multiclass

  0%|          | 0/21480 [00:00<?, ?ex/s]

  0%|          | 0/22 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 17184
    })
    test: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4296
    })
})

In [14]:
# 0 --> not hate speech
# 1 --> implicit hate speech
implicit_data_non_vs_implicit = implicit_data_multiclass \
    .filter(lambda ex: ex["label"] != 2)

implicit_data_non_vs_implicit

  0%|          | 0/18 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 16298
    })
    test: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4093
    })
})

In [15]:
# 0 --> incitement
# 1 --> inferiority
# 2 --> irony
# 3 --> stereotypical
# 4 --> threatening
# 5 --> white_grievance
# 6 --> other

implicit_data_fine = pd.read_table('implicit_hate_v1_stg2_posts.tsv') \
    .rename(columns={"post": "text", "implicit_class": "label"}) \
    .drop("extra_implicit_class", axis=1) \
    .replace({"incitement": 0, "inferiority": 1, "irony": 2, "stereotypical": 3, "threatening": 4, "white_grievance": 5, "other": 6})

implicit_data_fine = datasets.Dataset.from_pandas(implicit_data_fine) \
    .map(lambda ex: {"text": preprocess_tweet(ex["text"])})

implicit_data_fine

  0%|          | 0/6346 [00:00<?, ?ex/s]

Dataset({
    features: ['text', 'label'],
    num_rows: 6346
})

In [57]:
implicit_data_fine[0:100]["text"]

['jewish harvard professor noel ignatiev wants to abolish the white race via wr',
 'tune in todays jmt for my interview w  robert spencer on   the complete infidels guide to iran',
 'hate posters  you want hate buddy  read the koran  goo',
 'hunt for white women  black racist serial killer confesses to <number> murders',
 'looks like he was a fan of the islamic jewhate reza aslan peddles as critique of i',
 'antiwhites hate white solidarity  their hate for white european self preservation knows no',
 'are forever white racemass murderers',
 'i think white people are waking up in unprecedented numbers',
 'im not a racist at all but i mean all the numbers are saying caucaisian people are set to disappear soon so im for keeping the race',
 'its a planned growth of africans  a planned elimination of whites  whitegenocide  kalergi  maga tcot trump altright sjw buildawall',
 'its ok to be white  the book',
 'now it is full of jew and government infiltrators',
 'only racist if youre white  th

# Training a model on binary HSOL

In [14]:
model = transformers.BertForSequenceClassification.from_pretrained("bert-base-cased").to(device)

training_args = transformers.TrainingArguments(
    output_dir="bert-then-hsol-binary",
    report_to="wandb",
    run_name="bert-then-hsol-binary",
    evaluation_strategy="steps",
    per_device_train_batch_size=64,
    eval_steps=50,
    logging_steps=20,
)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=hsol_data_binary["train"],
    eval_dataset=hsol_data_binary["test"],
    compute_metrics=compute_all_metrics,
)

trainer.train()

model.save_pretrained("bert-then-hsol-binary")

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

Step,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Weighted
50,0.4964,0.513783,0.755699,0.755699,0.822696
100,0.3546,0.607534,0.755901,0.755901,0.822956
150,0.3243,0.361355,0.850918,0.850918,0.886252
200,0.2293,0.303698,0.88602,0.88602,0.908681
250,0.2256,0.304908,0.88602,0.88602,0.909211
300,0.1625,0.258688,0.915877,0.915877,0.927686
350,0.1456,0.227159,0.926165,0.926165,0.934434
400,0.1177,0.228077,0.925358,0.925358,0.933568
450,0.1248,0.236842,0.93484,0.93484,0.93912
500,0.1252,0.221379,0.934234,0.934234,0.939507


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 4957
  Batch size = 8
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 4957
  Batch size = 8
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 4957
  Batch size = 8
The following

In [14]:
model = transformers.BertForSequenceClassification.from_pretrained("bert-base-cased", num_labels=3).to(device)

training_args = transformers.TrainingArguments(
    output_dir="bert-then-hsol-multiclass",
    report_to="wandb",
    run_name="bert-then-hsol-multiclass",
    evaluation_strategy="steps",
    eval_steps=100,
    logging_steps=20,
    per_device_train_batch_size=64,
)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=hsol_data_multiclass["train"],
    eval_dataset=hsol_data_multiclass["test"],
    compute_metrics=compute_all_metrics,
)

trainer.train()

model.save_pretrained("bert-then-hsol-multiclass")

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

Step,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Weighted
100,0.3229,0.252069,0.912245,0.912245,0.888233
200,0.2795,0.243769,0.913859,0.913859,0.905973
300,0.2715,0.238251,0.916683,0.916683,0.916878
400,0.2237,0.238338,0.912245,0.912245,0.902585
500,0.2355,0.230743,0.91507,0.91507,0.914042
600,0.1937,0.23254,0.92092,0.92092,0.917987
700,0.1567,0.248942,0.917894,0.917894,0.91589
800,0.1716,0.260986,0.916885,0.916885,0.915334
900,0.203,0.254654,0.915877,0.915877,0.914176


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 4957
  Batch size = 8
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 4957
  Batch size = 8
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 4957
  Batch size = 8
The following

In [21]:
model = transformers.BertForSequenceClassification.from_pretrained("./bert-then-hsol-multiclass", num_labels=3).to(device)

training_args = transformers.TrainingArguments(
    output_dir="hsol-multiclass-then-implicit-multiclass",
    report_to="wandb",
    run_name="hsol-multiclass-then-implicit-multiclass",
    evaluation_strategy="steps",
    eval_steps=50,
    per_device_train_batch_size=64,
    logging_steps=20,
)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=implicit_data_multiclass["train"],
    eval_dataset=implicit_data_multiclass["test"],
    compute_metrics=compute_all_metrics,
)

trainer.train()

model.save_pretrained("hsol-multiclass-then-implicit-multiclass")

loading configuration file ./bert-then-hsol-multiclass/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.18.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading weights file ./bert-t

Step,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Weighted
50,0.6954,0.657388,0.689711,0.689711,0.669289
100,0.6841,0.63236,0.707402,0.707402,0.680949
150,0.628,0.633476,0.713222,0.713222,0.702077
200,0.6186,0.618286,0.717412,0.717412,0.714428
250,0.642,0.590512,0.733706,0.733706,0.725132
300,0.5096,0.609772,0.728585,0.728585,0.710317
350,0.4922,0.641132,0.728119,0.728119,0.725921
400,0.5111,0.598364,0.731844,0.731844,0.725463
450,0.4809,0.62123,0.736266,0.736266,0.721599
500,0.4775,0.613282,0.737197,0.737197,0.725455


***** Running Evaluation *****
  Num examples = 4296
  Batch size = 8
***** Running Evaluation *****
  Num examples = 4296
  Batch size = 8
***** Running Evaluation *****
  Num examples = 4296
  Batch size = 8
***** Running Evaluation *****
  Num examples = 4296
  Batch size = 8
***** Running Evaluation *****
  Num examples = 4296
  Batch size = 8
***** Running Evaluation *****
  Num examples = 4296
  Batch size = 8
***** Running Evaluation *****
  Num examples = 4296
  Batch size = 8
***** Running Evaluation *****
  Num examples = 4296
  Batch size = 8
***** Running Evaluation *****
  Num examples = 4296
  Batch size = 8
***** Running Evaluation *****
  Num examples = 4296
  Batch size = 8
Saving model checkpoint to hsol-multiclass-then-implicit-multiclass/checkpoint-500
Configuration saved in hsol-multiclass-then-implicit-multiclass/checkpoint-500/config.json
Model weights saved in hsol-multiclass-then-implicit-multiclass/checkpoint-500/pytorch_model.bin
***** Running Evaluation ****

In [15]:
model = transformers.BertForSequenceClassification.from_pretrained("./bert-then-hsol-binary", num_labels=2).to(device)

training_args = transformers.TrainingArguments(
    output_dir="hsol-binary-then-implicit-hate-vs-non",
    report_to="none",
    run_name="hsol-binary-then-implicit-hate-vs-non",
    evaluation_strategy="steps",
    eval_steps=100,
    logging_steps=20,
    per_device_train_batch_size=64,
)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=implicit_data_non_vs_implicit["train"],
    eval_dataset=implicit_data_non_vs_implicit["test"],
    compute_metrics=compute_all_metrics,
)

trainer.train()

model.save_pretrained("hsol-binary-then-implicit-hate-vs-non")

***** Running training *****
  Num examples = 16298
  Num Epochs = 3
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 765


Step,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Weighted
100,0.554,0.504652,0.742976,0.742976,0.731466
200,0.492,0.488934,0.753237,0.753237,0.743944
300,0.3948,0.494043,0.760567,0.760567,0.756225
400,0.3687,0.505437,0.763987,0.763987,0.755989
500,0.384,0.490211,0.769607,0.769607,0.767726
600,0.2349,0.605169,0.760078,0.760078,0.75779
700,0.2209,0.614214,0.763743,0.763743,0.763401


***** Running Evaluation *****
  Num examples = 4093
  Batch size = 8
***** Running Evaluation *****
  Num examples = 4093
  Batch size = 8
***** Running Evaluation *****
  Num examples = 4093
  Batch size = 8
***** Running Evaluation *****
  Num examples = 4093
  Batch size = 8
***** Running Evaluation *****
  Num examples = 4093
  Batch size = 8
Saving model checkpoint to hsol-binary-then-implicit-hate-vs-non/checkpoint-500
Configuration saved in hsol-binary-then-implicit-hate-vs-non/checkpoint-500/config.json
Model weights saved in hsol-binary-then-implicit-hate-vs-non/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 4093
  Batch size = 8
***** Running Evaluation *****
  Num examples = 4093
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


Configuration saved in hsol-binary-then-implicit-hate-vs-non/config.json
Model weights saved in hsol-binary-then-implicit-hate-vs-non/pytorch_model.bin


In [65]:
sarc_data = pd.read_csv('SARC2/sarc_processed.csv') \
    .drop(columns=['Unnamed: 0']) \
    .sample(frac=0.5) \
    .dropna()

sarc_data = datasets.Dataset.from_pandas(sarc_data) \
    .map(lambda ex: {"text": preprocess_tweet(ex["text"])}) \
    .map(lambda ex: tokenizer(ex["text"], padding="max_length", truncation=True, max_length=64), batched=True) \
    .remove_columns("text") \
    .train_test_split(test_size=0.2, seed=685) \
    .remove_columns(['__index_level_0__'])

sarc_data

  0%|          | 0/32332 [00:00<?, ?ex/s]

  0%|          | 0/33 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25865
    })
    test: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 6467
    })
})

In [66]:
model = transformers.BertForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2).to(device)

training_args = transformers.TrainingArguments(
    output_dir="bert-then-sarc",
    report_to="wandb",
    run_name="bert-then-sarc",
    evaluation_strategy="steps",
    eval_steps=100,
    logging_steps=20,
    per_device_train_batch_size=64,
)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=sarc_data["train"],
    eval_dataset=sarc_data["test"],
    compute_metrics=compute_all_metrics,
)

trainer.train()

model.save_pretrained("bert-then-sarc")

loading configuration file https://huggingface.co/bert-base-cased/resolve/main/config.json from cache at /home/ubuntu/.cache/huggingface/transformers/a803e0468a8fe090683bdc453f4fac622804f49de86d7cecaee92365d4a0f829.a64a22196690e0e82ead56f388a3ef3a50de93335926ccfa20610217db589307
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.18.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading weights file https://huggingface.co/bert-base-cased/resolve/main/pytorch_model.bin from cac

Step,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Weighted
100,0.6777,0.661592,0.59734,0.59734,0.586072
200,0.6689,0.634709,0.644348,0.644348,0.642282
300,0.6338,0.618442,0.659038,0.659038,0.650149
400,0.6174,0.60476,0.6728,0.6728,0.671645
500,0.5545,0.612572,0.671563,0.671563,0.671545
600,0.5629,0.635252,0.661358,0.661358,0.660389
700,0.5333,0.642238,0.669398,0.669398,0.667883
800,0.5258,0.622533,0.678831,0.678831,0.678097
900,0.3801,0.718209,0.665997,0.665997,0.665627
1000,0.3498,0.737332,0.667698,0.667698,0.667396


***** Running Evaluation *****
  Num examples = 6467
  Batch size = 8
***** Running Evaluation *****
  Num examples = 6467
  Batch size = 8
***** Running Evaluation *****
  Num examples = 6467
  Batch size = 8
***** Running Evaluation *****
  Num examples = 6467
  Batch size = 8
***** Running Evaluation *****
  Num examples = 6467
  Batch size = 8
Saving model checkpoint to bert-then-sarc/checkpoint-500
Configuration saved in bert-then-sarc/checkpoint-500/config.json
Model weights saved in bert-then-sarc/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 6467
  Batch size = 8
***** Running Evaluation *****
  Num examples = 6467
  Batch size = 8
***** Running Evaluation *****
  Num examples = 6467
  Batch size = 8
***** Running Evaluation *****
  Num examples = 6467
  Batch size = 8
***** Running Evaluation *****
  Num examples = 6467
  Batch size = 8
Saving model checkpoint to bert-then-sarc/checkpoint-1000
Configuration saved in bert-then-sarc/checkpoint-

In [69]:
sarc_data["train"].to_csv("SARC2/train-final.csv")

Creating CSV from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

16669397

In [70]:
sarc_data["test"].to_csv("SARC2/test-final.csv")

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

4164649

In [72]:
sarc_data["train"]

Dataset({
    features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 25865
})

In [16]:
model = transformers.BertForSequenceClassification.from_pretrained("./bert-then-sarc").to(device)

training_args = transformers.TrainingArguments(
    output_dir="sarc-then-implicit-hate-vs-non",
    report_to="wandb",
    run_name="sarc-then-implicit-hate-vs-non",
    evaluation_strategy="steps",
    eval_steps=100,
    logging_steps=20,
    per_device_train_batch_size=64,
)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=implicit_data_non_vs_implicit["train"],
    eval_dataset=implicit_data_non_vs_implicit["test"],
    compute_metrics=compute_all_metrics,
)

trainer.train()

model.save_pretrained("sarc-then-implicit-hate-vs-non")

***** Running training *****
  Num examples = 16298
  Num Epochs = 3
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 765
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Weighted
100,0.5359,0.509153,0.745175,0.745175,0.728623
200,0.4855,0.484106,0.758857,0.758857,0.753051
300,0.4141,0.49697,0.757391,0.757391,0.750679
400,0.3703,0.502297,0.76643,0.76643,0.76056
500,0.3896,0.501058,0.766186,0.766186,0.761148
600,0.2466,0.62056,0.758368,0.758368,0.755906
700,0.2258,0.634312,0.759101,0.759101,0.759029


***** Running Evaluation *****
  Num examples = 4093
  Batch size = 8
***** Running Evaluation *****
  Num examples = 4093
  Batch size = 8
***** Running Evaluation *****
  Num examples = 4093
  Batch size = 8
***** Running Evaluation *****
  Num examples = 4093
  Batch size = 8
***** Running Evaluation *****
  Num examples = 4093
  Batch size = 8
Saving model checkpoint to sarc-then-implicit-hate-vs-non/checkpoint-500
Configuration saved in sarc-then-implicit-hate-vs-non/checkpoint-500/config.json
Model weights saved in sarc-then-implicit-hate-vs-non/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 4093
  Batch size = 8
***** Running Evaluation *****
  Num examples = 4093
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


Configuration saved in sarc-then-implicit-hate-vs-non/config.json
Model weights saved in sarc-then-implicit-hate-vs-non/pytorch_model.bin
