## Requirements

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import joblib

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
def correct_dataframe(df, label_columns):
    """
    Coerce label columns to numeric values and replace missing entries with 0.

    Values that cannot be parsed as numbers are treated as missing (and thus
    end up as 0). Rows containing such values are printed before they are
    overwritten, so that bad input data does not go unnoticed.

    Note: the label columns of ``df`` are overwritten in place; the same
    DataFrame object is returned for convenience.

    Parameters:
    - df (pd.DataFrame): The input DataFrame.
    - label_columns (list): A list of column names to be treated as labels.

    Returns:
    - pd.DataFrame: ``df`` with numeric, NaN-free label columns.
    """
    label_columns = list(label_columns)
    if not label_columns:
        # Nothing to clean
        return df

    # Snapshot of the raw label values, needed to tell "was missing" apart
    # from "was present but not a number" after coercion.
    original = df[label_columns]

    # Convert label columns to numeric (unparseable values become NaN)
    try:
        converted = original.apply(pd.to_numeric, errors='coerce')
    except Exception as e:
        # Deliberate best effort: report the failure and continue with the
        # raw values so that missing entries are still filled below.
        print(f"Error during conversion to numeric: {e}")
        converted = original

    # Identify and print problematic rows: a value was present in the raw data
    # but turned into NaN, i.e. it could not be interpreted as a number.
    unparseable = original.notna() & converted.isna()
    problematic_rows = df[unparseable.any(axis=1)]
    if not problematic_rows.empty:
        print("Problematic rows with non-numeric elements:")
        print(problematic_rows)

    # Fill missing values with 0
    df[label_columns] = converted.fillna(0)

    return df

## Load and split data

In [None]:
# Load the TuPi hierarchical hate-speech dataset directly from its GitHub repository
labels_list = ['ageism', 'aporophobia', 'body_shame', 'capacitism', 'lgbtphobia', 'political',
               'racism', 'religious_intolerance', 'misogyny', 'xenophobia', 'other']
df = pd.read_csv("https://raw.githubusercontent.com/Silly-Machine/TuPi-Portuguese-Hate-Speech-Dataset/main/datasets/tupi_hierarchy.csv")

# Clean the category columns first, so that non-numeric or missing entries
# cannot distort (or break) the row sums used to derive `not_hate` below.
df = correct_dataframe(df, labels_list)

# A text is `not_hate` exactly when none of the hate categories is set
df['not_hate'] = df[labels_list].sum(axis=1).eq(0).astype(int)
labels_list.append('not_hate')

# Keep only the model input and the label columns
df = df[['text'] + labels_list]

In [None]:
# Two-way lookup between class indices and label names (handed to the model config)
id2label = dict(enumerate(labels_list))
label2id = {name: position for position, name in id2label.items()}

In [None]:
# Load pre-trained BERT model and tokenizer
model_name = "neuralmind/bert-large-portuguese-cased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(labels_list),
    id2label=id2label,
    label2id=label2id,
    # Every label is an independent 0/1 target (training uses BCEWithLogitsLoss);
    # recording this in the config keeps the model's own loss and any saved
    # checkpoint consistent with that objective.
    problem_type="multi_label_classification",
)

# The tokenizer of this checkpoint carries no maximum length, so `truncation=True`
# alone is ignored (it falls back to "no truncation"). Bound the sequences
# explicitly by the size of the model's position-embedding table.
max_length = model.config.max_position_embeddings

# Split the dataset into training and validation sets (stratified on hate / not hate)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'], df[labels_list], test_size=0.2, random_state=42, stratify=df['not_hate']
)

# Tokenize the texts (padding=True pads to the longest sequence of each split)
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=max_length)
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True, max_length=max_length)

# Convert labels to PyTorch tensors (one row of 0/1 indicators per text)
train_labels = torch.tensor(train_labels.values)
val_labels = torch.tensor(val_labels.values)

pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-large-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
# Create PyTorch datasets
class CustomDataset(Dataset):
    """
    Map-style dataset that pairs tokenizer encodings with their label rows.

    Parameters:
    - encodings: mapping from feature name (e.g. 'input_ids') to an indexable
      collection holding one entry per example.
    - labels: indexable collection holding one label entry per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _as_new_tensor(value):
        """Return `value` as a fresh tensor that shares no memory with its source."""
        if isinstance(value, torch.Tensor):
            # torch.tensor(<tensor>) emits a UserWarning on every call;
            # detach().clone() is the equivalent, warning-free copy.
            return value.detach().clone()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict with one tensor per encoding key plus a 'labels' tensor."""
        item = {key: self._as_new_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._as_new_tensor(self.labels[idx]).long()  # Convert the label to a long tensor
        return item

    def __len__(self):
        """Number of examples, taken from the labels."""
        return len(self.labels)

# Wrap the tokenized splits and their label tensors in Dataset objects
train_dataset = CustomDataset(encodings=train_encodings, labels=train_labels)
val_dataset = CustomDataset(encodings=val_encodings, labels=val_labels)


## Train

In [None]:
# Create PyTorch datasets
# NOTE: a class with this name is already defined in an earlier cell; this
# definition shadows it, so the two must be kept in sync (or one removed).
class CustomDataset(Dataset):
    """
    Map-style dataset that pairs tokenizer encodings with their label rows.

    Parameters:
    - encodings: mapping from feature name (e.g. 'input_ids') to an indexable
      collection holding one entry per example.
    - labels: indexable collection holding one label entry per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _as_new_tensor(value):
        """Return `value` as a fresh tensor that shares no memory with its source."""
        if isinstance(value, torch.Tensor):
            # torch.tensor(<tensor>) emits a UserWarning on every call;
            # detach().clone() is the equivalent, warning-free copy.
            return value.detach().clone()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict with one tensor per encoding key plus a 'labels' tensor."""
        item = {key: self._as_new_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._as_new_tensor(self.labels[idx]).long()  # Convert the label to a long tensor
        return item

    def __len__(self):
        """Number of examples, taken from the labels."""
        return len(self.labels)

# Wrap the tokenized splits in Dataset objects
train_dataset = CustomDataset(train_encodings, train_labels)
val_dataset = CustomDataset(val_encodings, val_labels)

# Define training parameters
batch_size = 32
num_epochs = 10
num_warmup_steps = 0  # You may adjust this based on your specific requirements

# One loader serves all epochs; shuffle=True draws a new sample order on every pass
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
total_batches = len(train_loader)

# Move the model to its device before the optimizer is built from its parameters
torch.cuda.empty_cache()
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# NOTE(review): AdamW here is the one imported from `transformers`, which upstream
# deprecates in favour of torch.optim.AdamW. The defaults differ (e.g. weight decay),
# so a switch should be made deliberately.
optimizer = AdamW(model.parameters(), lr=1e-5)

# scheduler.step() is called once per batch, so the schedule has to span the number
# of optimizer steps (batches per epoch x epochs), not the number of training
# samples; otherwise the learning rate barely decays during the run.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=total_batches * num_epochs,
)

# Use BCEWithLogitsLoss for multi-label classification (built once, reused for every batch)
loss_fn = torch.nn.BCEWithLogitsLoss()

# Training loop
model.train()

for epoch in range(num_epochs):
    print(f"\nEpoch {epoch + 1}/{num_epochs}")

    total_loss = 0.0

    for batch_idx, batch in enumerate(train_loader, 1):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Labels are not passed to the model; the loss is computed here from the logits
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        loss = loss_fn(logits, labels.float())  # BCE expects float targets

        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

        if batch_idx % 100 == 0:
            print(f"Batch {batch_idx}/{total_batches}, Loss: {loss.item()}")

    average_loss = total_loss / total_batches
    print(f"Epoch {epoch + 1} - Average Loss: {average_loss}")




Epoch 1/10


  item['labels'] = torch.tensor(self.labels[idx]).long()  # Convert the label to a long tensor


Batch 100/1092, Loss: 0.13500964641571045
Batch 200/1092, Loss: 0.1386624127626419
Batch 300/1092, Loss: 0.10018342733383179
Batch 400/1092, Loss: 0.11826922744512558
Batch 500/1092, Loss: 0.060162078589200974
Batch 600/1092, Loss: 0.10906629264354706
Batch 700/1092, Loss: 0.095174640417099
Batch 800/1092, Loss: 0.0988934263586998
Batch 900/1092, Loss: 0.12599794566631317
Batch 1000/1092, Loss: 0.05508360266685486
Epoch 1 - Average Loss: 0.1101217048716878

Epoch 2/10
Batch 100/1092, Loss: 0.07913634926080704
Batch 200/1092, Loss: 0.06320416927337646
Batch 300/1092, Loss: 0.07058559358119965
Batch 400/1092, Loss: 0.08023186773061752
Batch 500/1092, Loss: 0.0650903582572937
Batch 600/1092, Loss: 0.12923520803451538
Batch 700/1092, Loss: 0.031534343957901
Batch 800/1092, Loss: 0.09747114032506943
Batch 900/1092, Loss: 0.069057397544384
Batch 1000/1092, Loss: 0.057367488741874695
Epoch 2 - Average Loss: 0.0662623592115079

Epoch 3/10
Batch 100/1092, Loss: 0.06476868689060211
Batch 200/109