In [None]:
! pip install datasets transformers optuna

# Detector Development

In [1]:
# Relative path of the directory that holds the transformed CSV files read below.
TRANSFORMED_DATA_PATH = "../data/transformed"

## Initial training with Spanish data

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

# Load the cleaned corpus (semicolon-separated) and keep only the Spanish rows.
cleaned_data = pd.read_csv(f"{TRANSFORMED_DATA_PATH}/cleaned_data.csv", sep=";")
data_es = cleaned_data.loc[cleaned_data.language == "es", :]
# Drop the metadata columns so only the model inputs and the label remain.
data = data_es.drop(["original_id", "type", "language", "dataset"], axis=1).copy()

# Stratified 80/10/10 split into train / validation / test.
# `train_test_split` seeds its shuffling through `random_state`; it has no
# `seed` keyword, so passing `seed=` raises a TypeError.
train_data, eval_data = train_test_split(
    data, test_size=0.2, stratify=data.label, shuffle=True, random_state=1234
)
val_data, test_data = train_test_split(
    eval_data, test_size=0.5, stratify=eval_data.label, random_state=1234
)

data_files = {"train": train_data, "validation": val_data, "test": test_data}

def make_dataset_from_pandas(data: dict) -> DatasetDict:
    """Convert a mapping of split name -> pandas DataFrame into a `DatasetDict`.

    Args:
        data: Dictionary whose keys are split names (e.g. "train") and whose
            values are the DataFrames for those splits.

    Returns:
        A `DatasetDict` with one `Dataset` per split, containing only the
        DataFrame's columns.
    """
    # `preserve_index=False` keeps the pandas index out of the dataset. This
    # replaces dropping "__index_level_0__" afterwards, which fails when the
    # frame has a plain RangeIndex and that column was never created.
    return DatasetDict(
        {
            split: Dataset.from_pandas(df, preserve_index=False)
            for split, df in data.items()
        }
    )

# Wrap the three pandas splits in a DatasetDict; the bare name displays its summary.
sexism_dataset = make_dataset_from_pandas(data_files)
sexism_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 9154
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1144
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1145
    })
})

In [3]:
from transformers import AutoTokenizer

# Pretrained checkpoint identifier, shared by the tokenizer here and the models created later.
MODEL_CHECKPOINT = "dccuchile/bert-base-spanish-wwm-uncased"
# Tokenizer matching the checkpoint; the fast implementation is requested explicitly.
TOKENIZER = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT, use_fast=True)

def tokenize(examples, max_length=512, padding='max_length'):
    """Tokenize the "text" entry of a batch with the shared `TOKENIZER`.

    Args:
        examples: Mapping with a "text" entry, as passed by `Dataset.map`.
        max_length: Maximum sequence length; longer inputs are truncated.
        padding: Padding strategy forwarded to the tokenizer. The default
            pads every example to `max_length`, which matches the previously
            hard-coded behaviour.

    Returns:
        The tokenizer output for the batch.
    """
    return TOKENIZER(
        examples["text"],
        padding=padding,
        max_length=max_length,
        truncation=True,
    )

# Tokenize every split in batches; the bare name displays the resulting features.
sexism_tokenized = sexism_dataset.map(tokenize, batched=True)
sexism_tokenized

Downloading (…)okenizer_config.json:   0%|          | 0.00/310 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)lve/main/config.json:   0%|          | 0.00/650 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/486k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/134 [00:00<?, ?B/s]

Map:   0%|          | 0/9154 [00:00<?, ? examples/s]

Map:   0%|          | 0/1144 [00:00<?, ? examples/s]

Map:   0%|          | 0/1145 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9154
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1144
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1145
    })
})

In [4]:
from datasets import load_metric

# Load the four classification metrics used by `compute_metrics`, in one pass.
f1, accuracy, precision, recall = (
    load_metric(metric_name)
    for metric_name in ("f1", "accuracy", "precision", "recall")
)

def compute_metrics(eval_pred):
    """Compute accuracy, F1, precision and recall for a batch of predictions.

    Args:
        eval_pred: Pair `(logits, labels)`; the predicted class is the argmax
            of the logits along axis 1.

    Returns:
        A single dictionary merging the results of the four metrics.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)

    scores = {}
    # Same merge order as before: accuracy, f1, precision, recall.
    for metric in (accuracy, f1, precision, recall):
        scores.update(metric.compute(predictions=predictions, references=labels))
    return scores

  f1 = load_metric("f1")


Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.58k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.52k [00:00<?, ?B/s]

In [9]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

BATCH_SIZE = 16

# Sequence classifier with two output labels, initialised from the pretrained checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=2)

# Evaluate and checkpoint once per epoch so the best checkpoint (by F1) can be
# reloaded when training finishes.
args = TrainingArguments(
    output_dir="unicc/beto-sexism-detection",
    num_train_epochs=4,
    learning_rate=5e-5,
    weight_decay=0.01,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    # push_to_hub is left disabled for this run.
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=sexism_tokenized["train"],
    eval_dataset=sexism_tokenized["validation"],
    tokenizer=TOKENIZER,
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuc

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.554,0.491923,0.766608,0.708197,0.835052,0.614801
2,0.3919,0.555882,0.793706,0.776938,0.774011,0.779886
3,0.2489,0.688962,0.781469,0.775986,0.735144,0.821632
4,0.1255,0.997715,0.784091,0.768943,0.758303,0.779886


TrainOutput(global_step=2292, training_loss=0.29920243051871787, metrics={'train_runtime': 3599.0299, 'train_samples_per_second': 10.174, 'train_steps_per_second': 0.637, 'total_flos': 9634074403061760.0, 'train_loss': 0.29920243051871787, 'epoch': 4.0})

In [17]:
# Evaluate on the validation split (the Trainer's eval_dataset). Because
# load_best_model_at_end=True, the numbers match the best epoch by F1 in the training log.
trainer.evaluate()

{'eval_loss': 0.5558816194534302,
 'eval_accuracy': 0.7937062937062938,
 'eval_f1': 0.776937618147448,
 'eval_precision': 0.7740112994350282,
 'eval_recall': 0.7798861480075902,
 'eval_runtime': 42.7045,
 'eval_samples_per_second': 26.789,
 'eval_steps_per_second': 1.686,
 'epoch': 4.0}

In [15]:
# Run the fine-tuned model on the held-out test split and display its metrics.
preds = trainer.predict(sexism_tokenized["test"])
preds.metrics

{'test_loss': 0.641056478023529,
 'test_accuracy': 0.7703056768558952,
 'test_f1': 0.753514526710403,
 'test_precision': 0.7458256029684601,
 'test_recall': 0.7613636363636364,
 'test_runtime': 43.0277,
 'test_samples_per_second': 26.611,
 'test_steps_per_second': 1.673}

In [18]:
# Persist the fine-tuned model for the detector package.
# NOTE(review): `Trainer.save_model` takes an output directory, so this ".pkl"
# path presumably ends up as a folder rather than a pickle file. Confirm, and
# consider renaming it together with whatever loads the model.
trainer.save_model("../src/detector/beto-sexism-detection.pkl")

### Hyper-parameter search

**TODO:** This section is still a work in progress; the cells below have not been run yet.

In [None]:
! pip install optuna

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

def model_init():
    """Return a freshly initialised classifier for each hyper-parameter trial.

    `num_labels=2` is passed explicitly so the model matches the other
    classifiers built from `MODEL_CHECKPOINT` in this notebook.
    """
    return AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=2)

BATCH_SIZE = 16

# Baseline arguments for the search; the values explored by each trial
# override the corresponding fields.
args2 = TrainingArguments(
    "beto-finetuned-sexism-hpsearch",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=4,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    #push_to_hub=True,
)

# `model_init` (instead of a model instance) lets the Trainer rebuild the
# model from the checkpoint at the start of every trial.
trainer_hp = Trainer(
    model_init=model_init,
    args=args2,
    train_dataset=sexism_tokenized["train"],
    eval_dataset=sexism_tokenized["validation"],
    tokenizer=TOKENIZER,
    compute_metrics=compute_metrics
)

# Optimise validation F1, the same metric used to pick the best checkpoint.
# Without an explicit objective, the search maximises the sum of all metrics
# returned by `compute_metrics` rather than F1 alone.
best_run = trainer_hp.hyperparameter_search(
    n_trials=10,
    direction="maximize",
    compute_objective=lambda metrics: metrics["eval_f1"],
)

In [None]:
# Copy the best trial's hyper-parameters onto the search trainer's arguments,
# then retrain with them.
for n, v in best_run.hyperparameters.items():
    setattr(trainer_hp.args, n, v)

trainer_hp.train()

In [None]:
# Evaluate the retrained model on the validation split (the Trainer's eval_dataset).
trainer_hp.evaluate()

In [None]:
# Score the tuned model on the held-out test split.
# Note: this rebinds `preds`, replacing the predictions of the first model.
preds = trainer_hp.predict(sexism_tokenized["test"])
preds.metrics

## Full train including translated data

With the best set of hyperparameters, we can train on the full dataset obtained after translating the tweets and see whether any improvement occurs.

In [None]:
# NOTE(review): this file is read from the working directory, not from
# TRANSFORMED_DATA_PATH like the cleaned data. Confirm where it actually lives.
translated_data = pd.read_csv("translated_data.csv", sep=";")
# Keep every row whose language is not English.
data_full = translated_data.loc[translated_data.language != "en", :]
data2 = data_full.drop(["original_id", "type", "language", "dataset"], axis=1).copy()

# Stratify on the labels of the frame being split. The labels of the earlier
# Spanish-only split (`data` / `eval_data`) have a different length and index,
# so they cannot be used here. The seed matches the first split for reproducibility.
full_train_data, full_eval_data = train_test_split(
    data2, test_size=0.2, stratify=data2.label, shuffle=True, random_state=1234
)
full_val_data, full_test_data = train_test_split(
    full_eval_data, test_size=0.5, stratify=full_eval_data.label, random_state=1234
)

data_files = {"train": full_train_data, "validation": full_val_data, "test": full_test_data}

sexism_dataset_full = make_dataset_from_pandas(data_files)
sexism_dataset_full

In [None]:
# Tokenize the full (translated) dataset with the same function as the Spanish-only data.
sexism_tokenized_full = sexism_dataset_full.map(tokenize, batched=True)
sexism_tokenized_full

In [None]:
# Fresh classifier for the run on the full dataset.
model3 = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=2)

# `load_best_model_at_end=True` requires the evaluation and save strategies to
# match, otherwise TrainingArguments raises a ValueError. Both are set to
# "epoch", like the other training runs in this notebook.
args3 = TrainingArguments(
    "unicc/beto-finetuned-sexism",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    push_to_hub=True
)

# Apply the hyper-parameters selected by the search (requires `best_run` from
# the hyper-parameter search section).
for n, v in best_run.hyperparameters.items():
    setattr(args3, n, v)

args3

In [None]:
# `model3` is an already-instantiated model, so it is passed as `model=`.
# `model_init=` expects a callable that builds a model. The original line was
# also missing its trailing comma, which is a SyntaxError.
trainer3 = Trainer(
    model=model3,
    args=args3,
    train_dataset=sexism_tokenized_full["train"],
    eval_dataset=sexism_tokenized_full["validation"],
    tokenizer=TOKENIZER,
    compute_metrics=compute_metrics
)

trainer3.train()

In [None]:
# Score the model trained on the full dataset on that dataset's held-out test
# split. The previous code re-evaluated the first `trainer` on the Spanish-only
# data, which does not measure this run at all.
preds = trainer3.predict(sexism_tokenized_full["test"])
preds.metrics