## Requirements

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import joblib

## Load and split data

In [None]:
# Read the TuPi binary hate-speech CSV directly from its GitHub raw URL
# (this is a remote download, not a local file).
dataset_url = (
    "https://raw.githubusercontent.com/Silly-Machine/"
    "TuPi-Portuguese-Hate-Speech-Dataset/main/datasets/tupi_binary.csv"
)
df = pd.read_csv(dataset_url)
df.head()

Unnamed: 0,source,id,text,researcher,year,aggressive,hate
0,twitter,1.65848623693028e+18,@user @user @user quanto vc pagava na época da...,oliveira et al,2023,1,1
1,twitter,1.65848623777333e+18,@user os árabes já vão lhes chutar do país ??,oliveira et al,2023,1,1
2,twitter,1.65848960585394e+18,@user @user @user @user @user tem que desenhar...,oliveira et al,2023,1,1
3,twitter,1.65849012716374e+18,@user @user chola mais gado. e se não quiser p...,oliveira et al,2023,1,1
4,twitter,1.65849018793945e+18,michele micheque nao tinha cartao do bolsonaro...,oliveira et al,2023,1,1


In [None]:
# Load the tokenizer and the classification model from the same pretrained checkpoint
model_name = "bert-large-portuguese-cased"
checkpoint = f"neuralmind/{model_name}"  # Hub id shared by tokenizer and model

tokenizer = BertTokenizer.from_pretrained(checkpoint)
# num_labels=2 attaches a two-class head (binary classification is assumed)
model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=2)


tokenizer_config.json:   0%|          | 0.00/155 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/210k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/648 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-large-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Hold out 20% of the rows for validation, stratifying on the 'hate' column
# so the split is stratified by that label; the seed is fixed for repeatability.
text_column = df['text']
hate_column = df['hate']
train_texts, val_texts, train_labels, val_labels = train_test_split(
    text_column,
    hate_column,
    test_size=0.2,
    random_state=42,
    stratify=hate_column,
)

In [None]:
# Tokenize the texts
# truncation=True only takes effect when a maximum length is known. This
# tokenizer reports no predefined maximum (it warns and skips truncation), so
# pass the model's position-embedding limit explicitly; longer inputs would
# otherwise overflow the model's position embeddings.
max_length = model.config.max_position_embeddings
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=max_length)
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True, max_length=max_length)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
# Replace each label Series with a torch tensor built from its NumPy array.
# NOTE: this rebinds train_labels / val_labels, so the cell is not re-runnable
# without re-running the split first.
train_labels, val_labels = (
    torch.tensor(label_series.to_numpy())
    for label_series in (train_labels, val_labels)
)

In [None]:
# Create PyTorch datasets
class CustomDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with per-example labels.

    Args:
        encodings: Mapping from feature name (e.g. ``input_ids``) to an
            indexable collection holding one entry per example.
        labels: Indexable collection of labels (a tensor or a plain sequence),
            one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``, including a long ``labels`` entry."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        label = self.labels[idx]
        if isinstance(label, torch.Tensor):
            # torch.tensor(<tensor>) emits a UserWarning on every item;
            # detach().clone() is the recommended way to copy an existing tensor.
            label = label.detach().clone().long()
        else:
            label = torch.tensor(label).long()
        item['labels'] = label
        return item

    def __len__(self):
        """Number of examples, taken from the label collection."""
        return len(self.labels)

# Wrap each split's encodings and labels in a CustomDataset
train_dataset = CustomDataset(encodings=train_encodings, labels=train_labels)
val_dataset = CustomDataset(encodings=val_encodings, labels=val_labels)

## Train

In [None]:
# Define training parameters
optimizer = AdamW(model.parameters(), lr=1e-5)
num_epochs = 10
num_warmup_steps = 0  # You may adjust this based on your specific requirements
batch_size = 32  # must match the batch_size passed to DataLoader in the training loop

# scheduler.step() runs once per batch, so the schedule length is the number of
# optimizer steps (batches per epoch * epochs), not the number of samples.
# Sizing it by len(train_dataset) would stretch the linear decay ~batch_size
# times too long, so the learning rate would barely decrease during training.
steps_per_epoch = (len(train_dataset) + batch_size - 1) // batch_size  # ceil division
num_training_steps = steps_per_epoch * num_epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)



In [None]:
# Training loop
torch.cuda.empty_cache()
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
model.train()

# The loader does not change between epochs, so build it once; with
# shuffle=True it still draws a fresh ordering every time it is iterated.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
total_batches = len(train_loader)

for epoch in range(num_epochs):
    print(f"\nEpoch {epoch + 1}/{num_epochs}")

    total_loss = 0.0

    for batch_idx, batch in enumerate(train_loader, 1):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Passing labels makes the model compute the classification loss itself
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        scheduler.step()  # per-batch learning-rate update

        # Read the scalar once; each .item() call copies the value off the device
        batch_loss = loss.item()
        total_loss += batch_loss

        if batch_idx % 100 == 0:
            print(f"Batch {batch_idx}/{total_batches}, Loss: {batch_loss}")

    average_loss = total_loss / total_batches
    print(f"Epoch {epoch + 1} - Average Loss: {average_loss}")


Epoch 1/10


  item['labels'] = torch.tensor(self.labels[idx]).long()  # Convert the label to a long tensor


Batch 100/1092, Loss: 0.334164023399353
Batch 200/1092, Loss: 0.1297953724861145
Batch 300/1092, Loss: 0.30300962924957275
Batch 400/1092, Loss: 0.4419761896133423
Batch 500/1092, Loss: 0.2576327323913574
Batch 600/1092, Loss: 0.2348480075597763
Batch 700/1092, Loss: 0.249099001288414
Batch 800/1092, Loss: 0.20291490852832794
Batch 900/1092, Loss: 0.14401765167713165
Batch 1000/1092, Loss: 0.23460420966148376
Epoch 1 - Average Loss: 0.2598181171752103

Epoch 2/10
Batch 100/1092, Loss: 0.20399373769760132
Batch 200/1092, Loss: 0.22314023971557617
Batch 300/1092, Loss: 0.3232797384262085
Batch 400/1092, Loss: 0.21971988677978516
Batch 500/1092, Loss: 0.2581614851951599
Batch 600/1092, Loss: 0.28029078245162964
Batch 700/1092, Loss: 0.07725808024406433
Batch 800/1092, Loss: 0.29618850350379944
Batch 900/1092, Loss: 0.13887713849544525
Batch 1000/1092, Loss: 0.10890233516693115
Epoch 2 - Average Loss: 0.18782750081148122

Epoch 3/10
Batch 100/1092, Loss: 0.17806586623191833
Batch 200/1092,