# Instalações e bibliotecas necessárias

In [1]:
!pip install transformers -U
!pip install peft datasets torch peft
!pip install imblearn

Defaulting to user installation because normal site-packages is not writeable
Collecting transformers
  Downloading transformers-4.43.4-py3-none-any.whl (9.4 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.43.3
    Uninstalling transformers-4.43.3:
      Successfully uninstalled transformers-4.43.3
Successfully installed transformers-4.43.4
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [2]:
import json
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModelForPreTraining

# Carregar o modelo
from transformers import BertForSequenceClassification

2024-08-05 09:41:53.457323: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-08-05 09:41:53.614499: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-05 09:41:53.680631: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-05 09:41:53.697871: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-05 09:41:53.813239: I tensorflow/core/platform/cpu_feature_guar

In [3]:
import torch
torch.cuda.empty_cache()

# BERT

### Treino

In [4]:
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from imblearn.over_sampling import RandomOverSampler
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, load_metric
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

class BertWithPrefix(torch.nn.Module):
    def __init__(self, bert_model, prefix_tokens):
        super().__init__()
        self.bert = bert_model
        self.prefix_tokens = torch.nn.Parameter(prefix_tokens.clone().detach().float())
        for name, param in self.bert.named_parameters():
            if 'classifier' not in name:
                param.requires_grad = False

    def forward(self, input_ids, attention_mask=None, labels=None):
        device = input_ids.device
        batch_prefix_tokens = self.prefix_tokens.to(device).unsqueeze(0).repeat(input_ids.size(0), 1)
        extended_input_ids = torch.cat([batch_prefix_tokens.long(), input_ids], dim=1)
        if attention_mask is not None:
            prefix_mask = torch.ones(batch_prefix_tokens.size(), dtype=torch.long, device=device)
            extended_attention_mask = torch.cat([prefix_mask, attention_mask], dim=1)
        else:
            extended_attention_mask = None
        return self.bert(input_ids=extended_input_ids, attention_mask=extended_attention_mask, labels=labels)

def clean_text(text):
    """ Limpeza básica de texto """
    text = text.lower()  # minúsculas
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # remover URLs
    text = re.sub(r'@\w+', '', text)  # remover menções
    text = re.sub(r'\d+', '', text)  # remover números
    text = re.sub(r'[^A-Za-záéíóúàèìòùâêîôûãõ\b]', ' ', text)  # remover caracteres especiais
    return text.strip()

# Carregar e preparar dataset
data = pd.read_csv('./repos/HEDOS/HEDOS.csv')
data['text'] = data['text'].apply(clean_text)
filtered_data = data[data['final_label'] != 'Lixo'].copy()
filtered_data['final_label'] = filtered_data['final_label'].map({'not_toxic': 0, 'toxic': 1})
filtered_data.dropna(subset=['text', 'final_label'], inplace=True)

# Balanceamento de dados
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(filtered_data[['text']], filtered_data['final_label'])
resampled_data = pd.DataFrame({'text': X_resampled['text'], 'final_label': y_resampled})

# Preparar tokenizer e modelo
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Define and prepare prefix
prefix_text = "Classifique como tóxico qualquer texto que contenha palavões!"
prefix_tokens = tokenizer(prefix_text, return_tensors="pt", add_special_tokens=False)['input_ids'][0]

def preprocess_function(examples, labels):
    tokenized_inputs = tokenizer(examples, padding="max_length", truncation=True, max_length=512-len(prefix_tokens))
    return {'input_ids': tokenized_inputs['input_ids'], 'attention_mask': tokenized_inputs['attention_mask'], 'labels': labels}

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {
        "accuracy": accuracy_score(labels, predictions),
        "precision": precision_score(labels, predictions, average='binary'),
        "recall": recall_score(labels, predictions, average='binary'),
        "f1": f1_score(labels, predictions, average='binary'),
    }

# Definindo K-Fold Cross Validation
kf = StratifiedKFold(n_splits=5)
results = []

for train_index, test_index in kf.split(resampled_data['text'], resampled_data['final_label']):
    train_data = resampled_data.iloc[train_index]
    test_data = resampled_data.iloc[test_index]

    train_encodings = preprocess_function(train_data['text'].tolist(), train_data['final_label'].tolist())
    val_encodings = preprocess_function(test_data['text'].tolist(), test_data['final_label'].tolist())

    train_dataset = Dataset.from_dict(train_encodings)
    eval_dataset = Dataset.from_dict(val_encodings)

    training_args = TrainingArguments(
        output_dir='./results',
        evaluation_strategy='epoch',
        per_device_train_batch_size=4,
        gradient_accumulation_steps=8,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        fp16=True  # Usar GPU
    )

    bert_with_prefix = BertWithPrefix(model, prefix_tokens)
    trainer = Trainer(
        model=bert_with_prefix,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    train_result = trainer.train()
    eval_result = trainer.evaluate()
    results.append(eval_result)

# Consolidar resultados
average_results = {key: np.mean([dic[key] for dic in results]) for key in results[0]}
print("Média dos resultados da validação cruzada:", average_results)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
0,0.6845,0.671947,0.603138,0.602064,0.609756,0.605886
1,0.6866,0.671656,0.59907,0.59596,0.616725,0.606164
2,0.6844,0.672208,0.59907,0.589155,0.656214,0.620879




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
0,0.6833,0.674301,0.57699,0.565217,0.665116,0.611111
1,0.6741,0.675113,0.572342,0.55709,0.703488,0.621788
2,0.678,0.674047,0.578152,0.567404,0.655814,0.608414




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
0,0.6635,0.675204,0.584302,0.571429,0.674419,0.618667
1,0.6714,0.674249,0.58314,0.578142,0.615116,0.596056
2,0.6701,0.674128,0.580233,0.578054,0.594186,0.586009




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
0,0.6687,0.67475,0.587791,0.578728,0.645349,0.610225
1,0.6714,0.674648,0.585465,0.586572,0.57907,0.582797
2,0.6899,0.674453,0.588372,0.587356,0.594186,0.590751




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
0,0.6886,0.651336,0.630233,0.614053,0.701163,0.654723


KeyboardInterrupt: 

In [None]:
trainer.evaluate()

{'eval_loss': 0.650933563709259,
 'eval_accuracy': 0.622093023255814,
 'eval_precision': 0.6011560693641619,
 'eval_recall': 0.7255813953488373,
 'eval_f1': 0.6575342465753424,
 'eval_runtime': 433.9763,
 'eval_samples_per_second': 3.963,
 'eval_steps_per_second': 0.495,
 'epoch': 2.998256827425915}

In [None]:
trainer.save_model('./results/PEFT/HEDOS_BERTPrefixTuningCustomModel')

In [None]:
# Para 'model' é como instância de BertForSequenceClassification e 'tokenizer'  instância de BertTokenizer
model.save_pretrained("./results/PEFT/HEDOS_BERTPrefixTuningCustomModel")
tokenizer.save_pretrained("./results/PEFT/HEDOS_BERTPrefixTuningCustomModel")

('./results/PEFT/HEDOS_BERTPrefixTuningCustomModel/tokenizer_config.json',
 './results/PEFT/HEDOS_BERTPrefixTuningCustomModel/special_tokens_map.json',
 './results/PEFT/HEDOS_BERTPrefixTuningCustomModel/vocab.txt',
 './results/PEFT/HEDOS_BERTPrefixTuningCustomModel/added_tokens.json')

### Validação

In [None]:
# Preparar o tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Carregar o modelo
model_2 = BertForSequenceClassification.from_pretrained("./results/PEFT/HEDOS_BERTPrefixTuningCustomModel")
model_2.to('cuda')

In [None]:
# Definir o prefixo e configurações de tokenizer se usadas durante o treinamento
prefix_text = "Classifique como tóxico qualquer texto que contenha palavões!"
prefix_tokens = tokenizer(prefix_text, return_tensors="pt", add_special_tokens=False)['input_ids'][0]
max_length = 512 - len(prefix_tokens)

# Textos de exemplo para avaliação
examples = [
    "Você é incrível!",
    "Eu odeio quando você age assim!",
    "Esse lugar é horrível e as pessoas são terríveis.",
    "Que dia maravilhoso para um passeio no parque!",
    "Você é um idiota e ninguém gosta de você.",
    "vá para o inferno",
    "demonho",
    "puta que pariu, eu amei!",
    "nordestino nojento"
]

# Tokenizar os exemplos
#encoded_inputs = tokenizer(examples, padding=True, truncation=True, max_length=max_length, return_tensors="pt")

# Garantir que os inputs estejam no dispositivo CUDA
encoded_inputs = {key: val.to('cuda') for key, val in encoded_inputs.items()}

# Adicionar prefixo aos inputs
device = 'gpu'  # ou 'cpu' se CUDA não estiver disponível
batch_prefix_tokens = prefix_tokens.to(device).unsqueeze(0).repeat(encoded_inputs['input_ids'].size(0), 1)
extended_input_ids = torch.cat([batch_prefix_tokens, encoded_inputs['input_ids']], dim=1)

# Garantir que a attention mask também esteja ajustada
prefix_mask = torch.ones(batch_prefix_tokens.size(), dtype=torch.long, device=device)
extended_attention_mask = torch.cat([prefix_mask, encoded_inputs['attention_mask']], dim=1)

# Obter previsões
with torch.no_grad():
    outputs = model_2(input_ids=extended_input_ids, attention_mask=extended_attention_mask)
    predictions = torch.argmax(outputs.logits, dim=-1)
    probabilities = F.softmax(outputs.logits, dim=-1)  # Softmax sobre os logits para probabilidades

# Imprimir os resultados
for text, pred, prob in zip(examples, predictions, probabilities):
    prob_t = prob[pred].item() * 100  # Probabilidade da classe prevista
    print(f"Sentença: {text} - Tóxico: {'Sim' if pred.item() == 1 else 'Não'} - Probabilidade: {prob_t:.2f}%")


Sentença: Você é incrível! - Tóxico: Sim - Probabilidade: 66.69%
Sentença: Eu odeio quando você age assim! - Tóxico: Sim - Probabilidade: 63.09%
Sentença: Esse lugar é horrível e as pessoas são terríveis. - Tóxico: Sim - Probabilidade: 69.35%
Sentença: Que dia maravilhoso para um passeio no parque! - Tóxico: Sim - Probabilidade: 61.95%
Sentença: Você é um idiota e ninguém gosta de você. - Tóxico: Sim - Probabilidade: 72.66%
Sentença: vá para o inferno - Tóxico: Sim - Probabilidade: 65.48%
Sentença: demonho - Tóxico: Sim - Probabilidade: 63.66%
Sentença: puta que pariu, eu amei! - Tóxico: Sim - Probabilidade: 67.89%
Sentença: nordestino nojento - Tóxico: Sim - Probabilidade: 61.39%


# BERTimbal

### Treino

In [None]:
import torch
print(torch.cuda.is_available())


False


In [None]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

class BertWithPrefix(torch.nn.Module):
    def __init__(self, bert_model, prefix_tokens):
        super().__init__()
        self.bert = bert_model
        self.prefix_tokens = torch.nn.Parameter(prefix_tokens.clone().detach().float())

        # Congelar os pesos do modelo BERT, exceto para a camada de classificação
        for name, param in self.bert.named_parameters():
            if 'classifier' not in name:
                param.requires_grad = False

    def forward(self, input_ids, attention_mask=None, labels=None):
        batch_prefix_tokens = self.prefix_tokens.unsqueeze(0).repeat(input_ids.size(0), 1)
        extended_input_ids = torch.cat([batch_prefix_tokens.long(), input_ids], dim=1)

        if attention_mask is not None:
            prefix_mask = torch.ones(batch_prefix_tokens.size(), dtype=torch.long)
            extended_attention_mask = torch.cat([prefix_mask, attention_mask], dim=1)
        else:
            extended_attention_mask = None

        outputs = self.bert(input_ids=extended_input_ids, attention_mask=extended_attention_mask, labels=labels)
        return outputs

# Carregar dados
data = pd.read_csv('./repos/HEDOS/HEDOS.csv')
filtered_data = data[data['final_label'] != 'Lixo']
label_mapping = {'not_toxic': 0, 'toxic': 1}
filtered_data['final_label'] = filtered_data['final_label'].map(label_mapping)

X_train, X_val, y_train, y_val = train_test_split(filtered_data['text'], filtered_data['final_label'], test_size=0.2, stratify=filtered_data['final_label'], random_state=42)

# Carregar o tokenizer e o modelo BERT pré-treinado para classificação de sentenças
tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased', do_lower_case=False)
model = AutoModelForSequenceClassification.from_pretrained('neuralmind/bert-base-portuguese-cased', num_labels=2)

# Define and prepare prefix
prefix_text = "Classifique como tóxico qualquer texto que contenha palavões!"
prefix_tokens = tokenizer(prefix_text, return_tensors="pt", add_special_tokens=False)['input_ids'][0]
max_length = 512 - len(prefix_tokens)

def preprocess_function(examples, labels):
    tokenized_inputs = tokenizer(examples, padding="max_length", truncation=True, max_length=max_length)
    return {'input_ids': tokenized_inputs['input_ids'], 'attention_mask': tokenized_inputs['attention_mask'], 'labels': labels}

train_encodings = preprocess_function(X_train.tolist(), y_train.tolist())
val_encodings = preprocess_function(X_val.tolist(), y_val.tolist())

train_dataset = Dataset.from_dict(train_encodings)
eval_dataset = Dataset.from_dict(val_encodings)

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    fp16=False  # Desativar FP16 para execução em CPU
)

def compute_metrics(p):
    pred, labels = p.predictions, p.label_ids
    pred = np.argmax(pred, axis=1)
    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred)
    precision = precision_score(y_true=labels, y_pred=pred)
    f1 = f1_score(y_true=labels, y_pred=pred)
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

bert_with_prefix = BertWithPrefix(model, prefix_tokens)

trainer = Trainer(
    model=bert_with_prefix,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Iniciar o treinamento
try:
    train_result = trainer.train()
    eval_result = trainer.evaluate()
    results_dict = {
        "training_loss": train_result.training_loss,
        "eval_accuracy": eval_result['eval_accuracy'],
        "eval_f1": eval_result['eval_f1'],
        "train_runtime": train_result.metrics['train_runtime'],
        "train_samples_per_second": train_result.metrics['train_samples_per_second'],
        "total_flos": train_result.metrics['total_flos']
    }
    with open('./results/PEFT/hedos_bert_prefixtuning_training_results.json', 'w') as f:
        json.dump(results_dict, f)
    print("Resultados salvos com sucesso!")
except Exception as e:
    print(f"Erro durante o treinamento ou avaliação: {e}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['final_label'] = filtered_data['final_label'].map(label_mapping)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
0,0.568,0.525624,0.78162,0.786175,0.990708,0.87667
1,0.4945,0.516628,0.783439,0.783439,1.0,0.878571
2,0.5328,0.515868,0.783439,0.783439,1.0,0.878571


Resultados salvos com sucesso!


In [None]:
trainer.evaluate()

{'eval_loss': 0.5158679485321045,
 'eval_accuracy': 0.7834394904458599,
 'eval_precision': 0.7834394904458599,
 'eval_recall': 1.0,
 'eval_f1': 0.8785714285714286,
 'eval_runtime': 438.4244,
 'eval_samples_per_second': 2.507,
 'eval_steps_per_second': 0.315,
 'epoch': 2.994535519125683}

In [None]:
trainer.save_model('./results/PEFT/HEDOS_BERTimbauPrefixTuningCustomModel')

In [None]:
# Supondo que 'model' é sua instância de BertForSequenceClassification e 'tokenizer' é sua instância de BertTokenizer
model.save_pretrained("./results/PEFT/HEDOS_BERTimbauPrefixTuningCustomModel")
tokenizer.save_pretrained("./results/PEFT/HEDOS_BERTimbauPrefixTuningCustomModel")


('./results/PEFT/HEDOS_BERTimbauPrefixTuningCustomModel/tokenizer_config.json',
 './results/PEFT/HEDOS_BERTimbauPrefixTuningCustomModel/special_tokens_map.json',
 './results/PEFT/HEDOS_BERTimbauPrefixTuningCustomModel/vocab.txt',
 './results/PEFT/HEDOS_BERTimbauPrefixTuningCustomModel/added_tokens.json',
 './results/PEFT/HEDOS_BERTimbauPrefixTuningCustomModel/tokenizer.json')

### Validação

In [None]:
# Carregar o tokenizer
tokenizer = BertTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased')

# Carregar o modelo
model_2 = BertForSequenceClassification.from_pretrained("./results/PEFT/HEDOS_BERTimbauPrefixTuningCustomModel")
#model_2.to('cuda')

In [None]:
# Definir o prefixo e configurações de tokenizer se usadas durante o treinamento
prefix_text = "Classifique como tóxico qualquer texto que contenha palavões!"
prefix_tokens = tokenizer(prefix_text, return_tensors="pt", add_special_tokens=False)['input_ids'][0]
max_length = 512 - len(prefix_tokens)  # Ajuste conforme sua configuração

# Textos de exemplo para avaliação
examples = [
    "Você é incrível!",
    "Eu odeio quando você age assim!",
    "Esse lugar é horrível e as pessoas são terríveis.",
    "Que dia maravilhoso para um passeio no parque!",
    "Você é um idiota e ninguém gosta de você.",
    "vá para o inferno",
    "demonho",
    "puta que pariu, eu amei!",
    "nordestino nojento"
]

# Tokenizar os exemplos
encoded_inputs = tokenizer(examples, padding=True, truncation=True, max_length=max_length, return_tensors="pt")

# Garantir que os inputs estejam no dispositivo CUDA
#encoded_inputs = {key: val.to('cuda') for key, val in encoded_inputs.items()}

# Adicionar prefixo aos inputs
device = 'cpu'  # ou 'cpu' se CUDA não estiver disponível
batch_prefix_tokens = prefix_tokens.to(device).unsqueeze(0).repeat(encoded_inputs['input_ids'].size(0), 1)
extended_input_ids = torch.cat([batch_prefix_tokens, encoded_inputs['input_ids']], dim=1)

# Garantir que a attention mask também esteja ajustada
prefix_mask = torch.ones(batch_prefix_tokens.size(), dtype=torch.long, device=device)
extended_attention_mask = torch.cat([prefix_mask, encoded_inputs['attention_mask']], dim=1)

# Obter previsões
with torch.no_grad():
    outputs = model_2(input_ids=extended_input_ids, attention_mask=extended_attention_mask)
    predictions = torch.argmax(outputs.logits, dim=-1)
    probabilities = F.softmax(outputs.logits, dim=-1)  # Softmax sobre os logits para probabilidades

# Imprimir os resultados
for text, pred, prob in zip(examples, predictions, probabilities):
    prob_t = prob[pred].item() * 100  # Probabilidade da classe prevista
    print(f"Sentença: {text} - Tóxico: {'Sim' if pred.item() == 1 else 'Não'} - Probabilidade: {prob_t:.2f}%")

Sentença: Você é incrível! - Tóxico: Sim - Probabilidade: 54.90%
Sentença: Eu odeio quando você age assim! - Tóxico: Não - Probabilidade: 51.38%
Sentença: Esse lugar é horrível e as pessoas são terríveis. - Tóxico: Não - Probabilidade: 61.07%
Sentença: Que dia maravilhoso para um passeio no parque! - Tóxico: Sim - Probabilidade: 55.12%
Sentença: Você é um idiota e ninguém gosta de você. - Tóxico: Não - Probabilidade: 50.97%
Sentença: vá para o inferno - Tóxico: Sim - Probabilidade: 80.83%
Sentença: demonho - Tóxico: Sim - Probabilidade: 80.37%
Sentença: puta que pariu, eu amei! - Tóxico: Sim - Probabilidade: 81.92%
Sentença: nordestino nojento - Tóxico: Sim - Probabilidade: 80.91%
