### Train bs


In [None]:
!pip install transformers[torch]

In [None]:
import torch

# Select the compute device: CUDA when a GPU is visible, otherwise the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

if not use_cuda:
    print('No GPU available, using the CPU instead.')
else:
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

In [None]:
import pandas as pd
import re


# Root folder of the dataset: the spreadsheet is read from here and the model
# outputs / prediction dumps are written beneath it.
# NOTE(review): the second part of this notebook spells the same path
# 'BS-detect/...' (lowercase "d") -- confirm which casing matches the folder on
# disk, since the two differ on case-sensitive filesystems.
base_path = 'BS-Detect/working_dir/dataset/final/'
def words_counter(testo):
    """Return the number of words found in `testo`.

    The argument is passed through ``str()`` first, so non-string values are
    counted by their textual form instead of raising. A word is a maximal run
    of regex word characters.
    """
    return sum(1 for _ in re.finditer(r'\w+', str(testo)))

# Load the balanced dataset and attach the word count of every comment.
df = pd.read_excel(f'{base_path}balanced_dataset.xlsx').assign(
    words_n=lambda frame: frame['comment'].apply(words_counter)
)

# Average number of words per comment.
words_mean = df['words_n'].mean()
print("mean number:", words_mean)

In [None]:
from sklearn.model_selection import train_test_split

# Inputs are the values of the 'comment' column; targets are the 'bs' column
# with the string labels '1'/'0' mapped to the integers 1/0.
X = df['comment'].values
y = df['bs'].replace({'1': 1, '0': 0}).values
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
train_sentences, eval_sentences, train_labels, eval_labels = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# BERT tokenizer: To feed our text to BERT, it must be split into tokens, and then these tokens must be mapped to their index in the tokenizer vocabulary
from transformers import AutoTokenizer

# Checkpoint used for both the tokenizer and the classifier. The commented
# lines below are alternative checkpoints; enable exactly one assignment.
huggingface_model_name = 'Musixmatch/umberto-commoncrawl-cased-v1'
#huggingface_model_name = 'm-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0'
#huggingface_model_name = 'FacebookAI/xlm-roberta-base'


# Load the tokenizer that matches the selected checkpoint.
print('Loading tokenizer...')
tokenizer = AutoTokenizer.from_pretrained(huggingface_model_name)  # downloaded once, then served from the local cache directory


In [None]:
max_length = 32  # Every sentence is padded or truncated to this many tokens.
num_labels = 2


def _encode_split(sentences, labels, tokenizer, max_length, num_labels):
    """Tokenize `sentences` and one-hot encode `labels`.

    Args:
        sentences: Sequence of texts to encode.
        labels: Sequence of class indices aligned with `sentences`.
        tokenizer: Callable tokenizer returning a mapping that contains
            'input_ids' and 'attention_mask' tensors.
        max_length: Length every sentence is padded or truncated to.
        num_labels: Number of classes, i.e. the width of each one-hot row.

    Returns:
        A tuple `(input_ids, attention_masks, label_tensor)`. The first two
        are the per-sentence encodings concatenated along dim 0; the last is
        a float tensor of shape `(len(sentences), num_labels)`.
    """
    input_ids = []
    attention_masks = []
    label_tensor = torch.zeros((len(sentences), num_labels))

    for row, (sent, label) in enumerate(zip(sentences, labels)):
        # The tokenizer call adds the special tokens, pads/truncates to
        # `max_length` and builds the attention mask that marks padding.
        encoded = tokenizer(
            sent,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            return_tensors='pt',
            truncation=True,
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])

        # Valid columns are 0 .. num_labels - 1. A label equal to `num_labels`
        # would index past the last column and a negative one would wrap
        # around, so both are rejected. Rejected (and NaN) labels keep an
        # all-zero row, as before.
        if 0 <= label < num_labels:
            label_tensor[row, int(label)] = 1

    return (
        torch.cat(input_ids, dim=0),
        torch.cat(attention_masks, dim=0),
        label_tensor,
    )


# Encode the training split.
train_input_ids, train_attention_masks, train_lab_tensor = _encode_split(
    train_sentences, train_labels, tokenizer, max_length, num_labels
)

# Encode the evaluation split with the same settings.
eval_input_ids, eval_attention_masks, eval_lab_tensor = _encode_split(
    eval_sentences, eval_labels, tokenizer, max_length, num_labels
)

In [None]:
import torch
from torch.utils.data import IterableDataset
from torch.utils.data import TensorDataset, random_split


class MyDataLoader(IterableDataset):
  """Dataset over pre-tokenized inputs that yields one dict per example.

  Despite its name this is a dataset, not a ``torch.utils.data.DataLoader``.
  It supports both iteration and integer indexing; every item is a dict with
  the keys ``input_ids``, ``attention_mask`` and ``labels``.

  Args:
    ids: Tensor of token ids; dim 0 indexes the examples.
    mask: Tensor of attention masks, indexed like ``ids``.
    labels: 2-D tensor of labels, one row per example.
  """

  def __init__(self, ids, mask, labels):
    # Zero-argument super() is needed here: ``super(MyDataLoader)`` alone is
    # an unbound super object, so calling __init__ on it never reaches the
    # parent class initializer.
    super().__init__()
    self._ids = ids
    self._mask = mask
    self._labels = labels

  def __len__(self):
    """Return the number of examples (size of ``ids`` along dim 0)."""
    return self._ids.size(dim=0)

  def __iter__(self):
    """Yield every example in index order."""
    for idx in range(len(self)):
      yield self[idx]

  def __getitem__(self, idx):
    """Return the example at ``idx`` as a dict of tensors."""
    # The slices are already tensors, so they are returned as-is (dtype
    # preserved) instead of being wrapped in ``torch.Tensor`` again.
    return {
        "input_ids": self._ids[idx],
        "attention_mask": self._mask[idx],
        "labels": self._labels[idx, :],
    }

In [None]:
# Wrap the encoded train/eval tensors as datasets; the 80%/20% split was already made by train_test_split above.
from torch.utils.data import TensorDataset, random_split

# Build one dataset per split from the encoded ids, masks and one-hot labels.
trainset = MyDataLoader(train_input_ids, train_attention_masks, train_lab_tensor)
evalset = MyDataLoader(eval_input_ids, eval_attention_masks, eval_lab_tensor)

# Splitting with lengths [N, 0] keeps every example; only the first (full-size)
# part returned by random_split is kept, the empty second part is dropped.
trainset = random_split(trainset, [len(trainset), 0])[0]
evalset = random_split(evalset, [len(evalset), 0])[0]

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# The DataLoader needs to know our batch size for training, so we specify it
# here. For fine-tuning BERT on a specific task, the authors recommend a batch
# size of 16 or 32.
batch_size = 16

# Training batches are drawn in random order.
train_sampler = RandomSampler(trainset)
train_dataloader = DataLoader(trainset, sampler=train_sampler, batch_size=batch_size)

# Evaluation batches are read sequentially, since order does not matter there.
eval_sampler = SequentialSampler(evalset)
validation_dataloader = DataLoader(evalset, sampler=eval_sampler, batch_size=batch_size)

In [None]:
# define the model - we will use BERTForSequenceClassification because it has the same BERT architecture but with a single classification layer on top
from transformers import AutoModelForSequenceClassification

# Load BertForSequenceClassification, the pretrained BERT model with a single
# linear classification layer on top.

def my_model_init():
  """Build a fresh sequence-classification model and move it to `device`.

  Loads the `huggingface_model_name` checkpoint with a 2-label head, with
  attention and hidden-state outputs disabled and dict-style outputs enabled.
  Parameters whose name contains 'Bert' get `requires_grad = False`.

  Returns:
    The loaded model, after `model.to(device)` has been called on it.
  """
  load_options = dict(
      num_labels=2,  # Binary classification head.
      output_attentions=False,
      output_hidden_states=False,
      return_dict=True,
  )
  model = AutoModelForSequenceClassification.from_pretrained(
      huggingface_model_name, **load_options
  )

  # NOTE(review): this match is case-sensitive. Hugging Face parameter names
  # are usually lowercase (e.g. 'bert.' / 'roberta.' prefixes), so this filter
  # may never select anything -- confirm whether freezing is really intended.
  to_freeze = [param for name, param in model.named_parameters() if 'Bert' in name]
  for param in to_freeze:
    param.requires_grad = False

  model.to(device)
  return model

In [None]:
from transformers import EvalPrediction
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, accuracy_score


def compute_metrics(p: EvalPrediction):
  """Compute macro precision/recall/F1 and accuracy, and dump the predictions.

  Args:
    p: Evaluation output. `p.predictions` holds the class scores (or a tuple
      whose first element does) and `p.label_ids` the one-hot labels; both
      are reduced to class indices with `argmax(-1)`.

  Returns:
    Dict with the keys 'p', 'r', 'f1' (all macro-averaged) and 'accuracy'.

  Side effects:
    Overwrites `<base_path><huggingface_model_name>/bs/predictions.csv` with
    the predicted and true class of every example on each call.
  """
  import os  # Local import: only needed to create the output folder.

  scores = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
  y_pred = scores.argmax(-1)
  y_true = p.label_ids.argmax(-1)

  predictions_path = f'{base_path}{huggingface_model_name}/bs/predictions.csv'
  # to_csv does not create missing folders, and nothing else in this notebook
  # creates the 'bs' folder, so make sure it exists before writing.
  os.makedirs(os.path.dirname(predictions_path), exist_ok=True)
  new_df = pd.DataFrame({'pred_label': y_pred, 'true_label': y_true})
  new_df.to_csv(predictions_path, header=True)

  precision = precision_score(y_true=y_true, y_pred=y_pred, average='macro', zero_division='warn')
  recall = recall_score(y_true=y_true, y_pred=y_pred, average='macro')
  # Macro-averaged F1 (the previous local name said "micro", the call never did).
  f1_macro = f1_score(y_true=y_true, y_pred=y_pred, average='macro')
  accuracy = accuracy_score(y_true, y_pred)

  return {
      'p': precision,
      'r': recall,
      'f1': f1_macro,
      'accuracy': accuracy,
  }

In [None]:
from transformers import TrainerCallback

class EarlyStoppingCallback(TrainerCallback):
    """Stop training once `eval_loss` stops improving.

    Training is stopped after `patience` consecutive evaluations whose
    `eval_loss` is not strictly lower than the best value seen so far.

    NOTE(review): the `transformers` library ships a callback with the same
    class name; this local class is the one used in this notebook.

    Args:
        patience: Number of consecutive non-improving evaluations tolerated
            before `control.should_training_stop` is set.
    """

    def __init__(self, patience=3):
        self.patience = patience
        self.best_metric = float('inf')
        self.counter = 0

    def on_train_begin(self, args, state, control, **kwargs):
        """Reset the tracking state so an instance can be reused across runs."""
        self.best_metric = float('inf')
        self.counter = 0

    def on_evaluate(self, args, state, control, **kwargs):
        """Update the patience counter from the latest evaluation metrics."""
        # Prefer the metrics handed to the event; fall back to the last log
        # entry when they are not provided.
        metrics = kwargs.get("metrics")
        if metrics is None and state.log_history:
            metrics = state.log_history[-1]

        metric = (metrics or {}).get("eval_loss")
        if metric is None:
            # Nothing to compare against (e.g. the entry is not an eval log).
            return

        if metric < self.best_metric:
            self.best_metric = metric
            self.counter = 0
            return

        self.counter += 1
        if self.counter >= self.patience:
            control.should_training_stop = True


early_stopping_callback = EarlyStoppingCallback(patience=3)


In [None]:
import math

batch_size = 16
num_epochs = 30

lr = 5e-5
eps= 2e-10
adam_beta_1 = 0.9
adam_beta_2 = 0.999

# Warm up over the first 20% of the optimizer steps. `len(trainset)` counts
# samples, not steps, so it is converted to batches per epoch first; otherwise
# the warm-up would last longer than the whole training run.
# Assumes a single device and no gradient accumulation -- TODO confirm.
steps_per_epoch = math.ceil(len(trainset) / batch_size)
warmup_steps = int(steps_per_epoch * num_epochs * 0.2)

# Checkpoints go beneath the dataset folder; only the latest one is kept.
out_dir = f'{base_path}{huggingface_model_name}/original'
num_saved_models = 1

In [None]:
from transformers import Trainer, TrainingArguments

# Training configuration: evaluate, log and checkpoint once per epoch, then
# reload the checkpoint with the best macro precision ('p') at the end.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version.
training_args = TrainingArguments(output_dir=out_dir,
                                  overwrite_output_dir=True,
                                  do_train=True,
                                  do_eval=True,
                                  do_predict=True,
                                  # Mixed precision needs a GPU; the notebook
                                  # falls back to the CPU when none is found,
                                  # so only enable it when CUDA is available.
                                  fp16=torch.cuda.is_available(),
                                  evaluation_strategy='epoch',
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  learning_rate=lr,
                                  #adam_beta1=adam_beta_1,
                                  #adam_beta2=adam_beta_2,
                                  adam_epsilon=eps,
                                  lr_scheduler_type='linear',
                                  warmup_steps=warmup_steps,
                                  num_train_epochs=num_epochs,
                                  save_strategy='epoch',
                                  save_total_limit=num_saved_models,
                                  load_best_model_at_end=True,
                                  metric_for_best_model='p',
                                  logging_strategy='epoch')

In [None]:
# Assemble the Trainer. `model_init` is passed instead of a model instance, so
# the Trainer calls `my_model_init` itself to obtain the model.
# Metrics come from `compute_metrics`; the callback stops training early when
# the evaluation loss stops improving.
trainer = Trainer(
    model_init=my_model_init,
    args=training_args,
    train_dataset=trainset,
    eval_dataset=evalset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping_callback]
)

In [None]:
# Run fine-tuning with the configuration assembled above.
trainer.train()


In [None]:
# Final evaluation on the held-out split. Note that this also triggers
# `compute_metrics`, which rewrites the predictions CSV.
trainer.evaluate(evalset)


### Labels Second Task

In [None]:
!pip install transformers[torch]

import torch

# Select the compute device: CUDA when a GPU is visible, otherwise the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

if not use_cuda:
    print('No GPU available, using the CPU instead.')
else:
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

In [None]:
import torch
from torch.utils.data import IterableDataset
from torch.utils.data import TensorDataset, random_split
from transformers import AutoModelForSequenceClassification

from transformers import EvalPrediction
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, accuracy_score


# Hyper-parameters shared by every per-target training run below.
batch_size = 16
num_epochs = 30

lr = 1e-5
eps= 2e-10
# The two Adam betas are kept for reference: the matching TrainingArguments
# options are commented out in the training loop of this section.
adam_beta_1 = 0.9
adam_beta_2 = 0.999

class MyDataLoader(IterableDataset):
  """Dataset over pre-tokenized inputs that yields one dict per example.

  Despite its name this is a dataset, not a ``torch.utils.data.DataLoader``.
  It supports both iteration and integer indexing; every item is a dict with
  the keys ``input_ids``, ``attention_mask`` and ``labels``.

  Args:
    ids: Tensor of token ids; dim 0 indexes the examples.
    mask: Tensor of attention masks, indexed like ``ids``.
    labels: 2-D tensor of labels, one row per example.
  """

  def __init__(self, ids, mask, labels):
    # Zero-argument super() is needed here: ``super(MyDataLoader)`` alone is
    # an unbound super object, so calling __init__ on it never reaches the
    # parent class initializer.
    super().__init__()
    self._ids = ids
    self._mask = mask
    self._labels = labels

  def __len__(self):
    """Return the number of examples (size of ``ids`` along dim 0)."""
    return self._ids.size(dim=0)

  def __iter__(self):
    """Yield every example in index order."""
    for idx in range(len(self)):
      yield self[idx]

  def __getitem__(self, idx):
    """Return the example at ``idx`` as a dict of tensors."""
    # The slices are already tensors, so they are returned as-is (dtype
    # preserved) instead of being wrapped in ``torch.Tensor`` again.
    return {
        "input_ids": self._ids[idx],
        "attention_mask": self._mask[idx],
        "labels": self._labels[idx, :],
    }




def my_model_init():
  """Build a fresh sequence-classification model and move it to `device`.

  Loads the `huggingface_model_name` checkpoint with a 2-label head, with
  attention and hidden-state outputs disabled and dict-style outputs enabled.
  Parameters whose name contains 'Bert' get `requires_grad = False`.

  Returns:
    The loaded model, after `model.to(device)` has been called on it.
  """
  load_options = dict(
      num_labels=2,
      output_attentions=False,
      output_hidden_states=False,
      return_dict=True,
  )
  model = AutoModelForSequenceClassification.from_pretrained(
      huggingface_model_name, **load_options
  )

  # NOTE(review): this match is case-sensitive. Hugging Face parameter names
  # are usually lowercase (e.g. 'bert.' / 'roberta.' prefixes), so this filter
  # may never select anything -- confirm whether freezing is really intended.
  to_freeze = [param for name, param in model.named_parameters() if 'Bert' in name]
  for param in to_freeze:
    param.requires_grad = False

  model.to(device)
  return model


def compute_metrics(p: EvalPrediction):
  """Return macro precision, recall and F1 plus accuracy for an evaluation.

  Args:
    p: Evaluation output. `p.predictions` holds the class scores (or a tuple
      whose first element does) and `p.label_ids` the one-hot labels; both
      are reduced to class indices with `argmax(-1)`.

  Returns:
    Dict with the keys 'p', 'r', 'f1' (all macro-averaged) and 'accuracy'.
  """
  raw = p.predictions
  scores = raw[0] if isinstance(raw, tuple) else raw

  y_pred = scores.argmax(-1)
  y_true = p.label_ids.argmax(-1)

  return {
      'p': precision_score(y_true=y_true, y_pred=y_pred, average='macro', zero_division='warn'),
      'r': recall_score(y_true=y_true, y_pred=y_pred, average='macro'),
      'f1': f1_score(y_true=y_true, y_pred=y_pred, average='macro'),
      'accuracy': accuracy_score(y_true, y_pred),
  }


from transformers import TrainerCallback

class EarlyStoppingCallback(TrainerCallback):
    """Stop training once `eval_loss` stops improving.

    Training is stopped after `patience` consecutive evaluations whose
    `eval_loss` is not strictly lower than the best value seen so far.

    NOTE(review): the `transformers` library ships a callback with the same
    class name; this local class is the one used in this notebook.

    Args:
        patience: Number of consecutive non-improving evaluations tolerated
            before `control.should_training_stop` is set.
    """

    def __init__(self, patience=3):
        self.patience = patience
        self.best_metric = float('inf')
        self.counter = 0

    def on_train_begin(self, args, state, control, **kwargs):
        """Reset the tracking state so an instance can be reused across runs."""
        self.best_metric = float('inf')
        self.counter = 0

    def on_evaluate(self, args, state, control, **kwargs):
        """Update the patience counter from the latest evaluation metrics."""
        # Prefer the metrics handed to the event; fall back to the last log
        # entry when they are not provided.
        metrics = kwargs.get("metrics")
        if metrics is None and state.log_history:
            metrics = state.log_history[-1]

        metric = (metrics or {}).get("eval_loss")
        if metric is None:
            # Nothing to compare against (e.g. the entry is not an eval log).
            return

        if metric < self.best_metric:
            self.best_metric = metric
            self.counter = 0
            return

        self.counter += 1
        if self.counter >= self.patience:
            control.should_training_stop = True


In [None]:
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from torch.utils.data import TensorDataset, random_split
from transformers import Trainer, TrainingArguments

import pandas as pd
import re


# The loaders built in the loop need these names; they are imported here so
# this section also runs on its own.
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler


def _encode_split(sentences, labels, tokenizer, max_length, num_labels):
  """Tokenize `sentences` and one-hot encode `labels`.

  Args:
    sentences: Sequence of texts to encode.
    labels: Sequence of class indices aligned with `sentences`.
    tokenizer: Callable tokenizer returning a mapping that contains
      'input_ids' and 'attention_mask' tensors.
    max_length: Length every sentence is padded or truncated to.
    num_labels: Number of classes, i.e. the width of each one-hot row.

  Returns:
    A tuple `(input_ids, attention_masks, label_tensor)`. The first two are
    the per-sentence encodings concatenated along dim 0; the last is a float
    tensor of shape `(len(sentences), num_labels)`.
  """
  input_ids = []
  attention_masks = []
  label_tensor = torch.zeros((len(sentences), num_labels))

  for row, (sent, label) in enumerate(zip(sentences, labels)):
    encoded = tokenizer(
        sent,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        return_tensors='pt',
        truncation=True,
    )
    input_ids.append(encoded['input_ids'])
    attention_masks.append(encoded['attention_mask'])

    # Valid columns are 0 .. num_labels - 1. A label equal to `num_labels`
    # would index past the last column and a negative one would wrap around,
    # so both are rejected. Rejected (and NaN) labels keep an all-zero row.
    if 0 <= label < num_labels:
      label_tensor[row, int(label)] = 1

  return (
      torch.cat(input_ids, dim=0),
      torch.cat(attention_masks, dim=0),
      label_tensor,
  )


# NOTE(review): the first part of this notebook spells this path
# 'BS-Detect/...' (uppercase "D") -- confirm which casing matches the folder
# on disk.
base_path = 'BS-detect/working_dir/dataset/final/'

df = pd.read_excel(f'{base_path}balanced_dataset.xlsx')
# Keep only the rows flagged as positive for 'bs'; the per-target classifiers
# are trained on this subset.
# NOTE(review): this compares against the integer 1, while the label columns
# are also mapped from the strings '1'/'0' below -- confirm the dtype of 'bs'.
df = df.loc[df['bs'] == 1]
targets = ['fatphobia', 'skinny-shaming','misoginy/sexism',
         'racism', 'ableism', 'queerphobia']

# Settings that do not depend on the target are set once, outside the loop.
huggingface_model_name = 'Musixmatch/umberto-commoncrawl-cased-v1'
#huggingface_model_name = 'm-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0'
#huggingface_model_name = 'FacebookAI/xlm-roberta-base'

tokenizer = AutoTokenizer.from_pretrained(huggingface_model_name)

max_length = 32
num_labels = 2
batch_size = 16
num_saved_models = 1

# Final evaluation metrics per target. Bare expressions inside a loop are not
# displayed by the notebook, so the results are stored and printed explicitly.
results = {}

for target in targets:

  X = df['comment'].values
  y = df[target].replace({'1': 1, '0': 0}).values
  train_sentences, eval_sentences, train_labels, eval_labels = train_test_split(X, y, test_size=0.2, random_state=42)

  train_input_ids, train_attention_masks, train_lab_tensor = _encode_split(
      train_sentences, train_labels, tokenizer, max_length, num_labels
  )
  eval_input_ids, eval_attention_masks, eval_lab_tensor = _encode_split(
      eval_sentences, eval_labels, tokenizer, max_length, num_labels
  )

  trainset = MyDataLoader(train_input_ids, train_attention_masks, train_lab_tensor)
  evalset = MyDataLoader(eval_input_ids, eval_attention_masks, eval_lab_tensor)

  # Splitting with lengths [N, 0] keeps every example; only the first part
  # returned by random_split is used.
  trainset, _ = random_split(trainset, [len(trainset), 0])
  evalset, _ = random_split(evalset,  [len(evalset), 0])

  train_dataloader = DataLoader(
              trainset,
              sampler=RandomSampler(trainset),
              batch_size = batch_size
          )

  validation_dataloader = DataLoader(
              evalset,
              sampler=SequentialSampler(evalset),
              batch_size = batch_size
          )

  # A target name containing '/' (e.g. 'misoginy/sexism') yields a nested
  # output folder.
  out_dir = f'{base_path}{huggingface_model_name}/{target}'

  # 20% of the optimizer steps (batches per epoch times epochs). Currently not
  # passed to TrainingArguments: the `warmup_steps` option below is disabled.
  steps_per_epoch = (len(trainset) + batch_size - 1) // batch_size
  warmup_steps = int(steps_per_epoch * num_epochs * 0.2)

  # NOTE(review): newer transformers releases rename `evaluation_strategy` to
  # `eval_strategy` -- confirm against the installed version.
  training_args = TrainingArguments(output_dir=out_dir,
                                    overwrite_output_dir=True,
                                    do_train=True,
                                    do_eval=True,
                                    do_predict=True,
                                    # Mixed precision needs a GPU; only enable
                                    # it when CUDA is actually available.
                                    fp16=torch.cuda.is_available(),
                                    evaluation_strategy='epoch',
                                    per_device_train_batch_size=batch_size,
                                    per_device_eval_batch_size=batch_size,
                                    learning_rate=lr,
                                    #adam_beta1=adam_beta_1,
                                    #adam_beta2=adam_beta_2,
                                    adam_epsilon=eps,
                                    lr_scheduler_type='linear',
                                    #warmup_steps=warmup_steps,
                                    num_train_epochs=num_epochs,
                                    save_strategy='epoch',
                                    save_total_limit=num_saved_models,
                                    load_best_model_at_end=True,
                                    metric_for_best_model='p',
                                    logging_strategy='epoch')

  # A fresh callback per target, so patience counting starts from scratch.
  early_stopping_callback = EarlyStoppingCallback(patience=3)

  trainer = Trainer(
      model_init=my_model_init,
      args=training_args,
      train_dataset=trainset,
      eval_dataset=evalset,
      compute_metrics=compute_metrics,
      callbacks=[early_stopping_callback]
  )

  trainer.train()

  results[target] = trainer.evaluate(evalset)
  print(target, results[target])
