In [None]:
! pip install accelerate -U
! pip install transformers[torch]

Collecting accelerate
  Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.24.1


In [None]:
import torch
from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
torch.cuda.empty_cache()

# Seed PyTorch so the newly initialised classification head (and any later
# shuffling that relies on the global generator) is repeatable across runs.
seed = 42
torch.manual_seed(seed)

# Enable GPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# NOTE(review): model_name / model_tokenizer_path are not referenced again in the
# visible cells; they are kept in case later cells rely on them.
model_name = "tupi-bert-large-portuguese-cased-multiclass-multilabel"
model_tokenizer_path = f"FpOliveira/{model_name}"
# Checkpoint that is actually loaded and fine-tuned below (tokenizer and model).
checkpoint = "FpOliveira/tupi-gpt2-small"

# Define features and target
x_name = "text"
hate_columns = ['ageism', 'aporophobia', 'body_shame', 'capacitism', 'lgbtphobia', 'political',
                'racism', 'religious_intolerance', 'misogyny', 'xenophobia', 'other']
# 'not_hate' is a derived class that is set when none of the hate columns is set.
label_columns = hate_columns + ['not_hate']


# Load and split the dataset into training and validation sets (stratified)
df = pd.read_csv("https://raw.githubusercontent.com/Silly-Machine/TuPi-Portuguese-Hate-Speech-Dataset/main/datasets/tupi_hierarchy.csv")
# Vectorised equivalent of checking, row by row, that the hate columns sum to zero.
df['not_hate'] = (df[hate_columns].sum(axis=1) == 0).astype(int)
df = df[[x_name] + label_columns]
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df[x_name],
    df[label_columns],
    test_size=0.2,
    random_state=seed,
    stratify=df['not_hate'],
)
# Keep the held-out split as a DataFrame (text + labels) for later inspection.
test_dataset = pd.DataFrame({x_name: val_texts})
test_dataset[label_columns] = pd.DataFrame(val_labels[label_columns].values.tolist(), index=test_dataset.index)

# Define the task
task_name = "hate_multi_label_classification"
# One output per label column (the original referenced an undefined `labels_list`).
num_labels = len(label_columns)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Load the pre-trained GPT-2 model. The checkpoint ships a 2-way `score` head, so
# ignore_mismatched_sizes=True lets it be re-initialised with num_labels outputs.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=num_labels,
    ignore_mismatched_sizes=True,
)

# GPT-2 has no dedicated padding token: reuse EOS for padding and tell the model
# which id the tokenizer will actually write into padded positions.
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

# truncation=True is a no-op unless a maximum length is known, so cap sequences
# at the number of positions the GPT-2 model can embed.
max_length = model.config.n_positions
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=max_length)
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True, max_length=max_length)

train_labels = torch.tensor(train_labels.values)
val_labels = torch.tensor(val_labels.values)

# Check tokenization output: padded length of the first training example
print(len(train_encodings['input_ids'][0]))

class NewsDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with per-example labels.

    Args:
        encodings: Mapping from field name (for example ``input_ids`` or
            ``attention_mask``) to an indexable collection holding one entry
            per example.
        labels: Indexable collection with one label row per example; may be a
            tensor or any nested sequence accepted by ``torch.tensor``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _as_new_tensor(value):
        """Return ``value`` as a fresh tensor that does not share storage with it.

        Calling ``torch.tensor`` on an existing tensor emits a copy-construct
        ``UserWarning``; ``clone().detach()`` is the recommended equivalent.
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict with one tensor per encoding field plus a ``labels`` tensor."""
        item = {key: self._as_new_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._as_new_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the label collection."""
        return len(self.labels)

# Wrap each tokenized split together with its label tensor so it can be fed to a DataLoader.
train_dataset = NewsDataset(encodings=train_encodings, labels=train_labels)
val_dataset = NewsDataset(encodings=val_encodings, labels=val_labels)


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at FpOliveira/tupi-gpt2-small and are newly initialized because the shapes did not match:
- score.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([12, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


432


In [None]:
# Fine-tune the classifier with a manual training loop.
#
# The model was loaded with `num_labels` outputs, so its own classification head
# (`score` for GPT-2) already has the right size. No extra `classifier` layer is
# attached here: the forward pass would never use it.

# Define training parameters
batch_size = 38
num_epochs = 3
num_warmup_steps = 0  # You may adjust this based on your specific requirements
learning_rate = 1e-5

# Build the loader once; with shuffle=True it still reshuffles on every epoch.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
total_batches = len(train_dataloader)

# Move the model to the target device before building the optimizer.
torch.cuda.empty_cache()
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and
# weight_decay are set explicitly to keep that implementation's defaults.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0)

# The scheduler is stepped once per batch, so the schedule length must be the
# number of optimizer steps (batches * epochs), not the number of samples.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=total_batches * num_epochs,
)

# Use BCEWithLogitsLoss for multi-label classification (one sigmoid per label)
loss_fn = torch.nn.BCEWithLogitsLoss()

# Training loop
model.train()

for epoch in range(num_epochs):
    print(f"\nEpoch {epoch + 1}/{num_epochs}")

    total_loss = 0.0

    for batch_idx, batch in enumerate(train_dataloader, 1):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Labels are not passed to the model, so it returns logits only and the
        # loss is computed explicitly below.
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        loss = loss_fn(logits, labels.float())  # BCE expects float targets

        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

        if batch_idx % 100 == 0:
            print(f"Batch {batch_idx}/{total_batches}, Loss: {loss.item()}")

    average_loss = total_loss / total_batches
    print(f"Epoch {epoch + 1} - Average Loss: {average_loss}")




Epoch 1/3


  item['labels'] = torch.tensor(self.labels[idx])


Batch 100/920, Loss: 0.11196088790893555
Batch 200/920, Loss: 0.08852661401033401
Batch 300/920, Loss: 0.11607339233160019
Batch 400/920, Loss: 0.0970134437084198
Batch 500/920, Loss: 0.03887900337576866
Batch 600/920, Loss: 0.13669031858444214
Batch 700/920, Loss: 0.04247033968567848
Batch 800/920, Loss: 0.0601988285779953
Batch 900/920, Loss: 0.028390266001224518
Epoch 1 - Average Loss: 0.10353214330161395

Epoch 2/3
Batch 100/920, Loss: 0.08226053416728973
Batch 200/920, Loss: 0.08628883957862854
Batch 300/920, Loss: 0.03594090789556503
Batch 400/920, Loss: 0.07123199850320816
Batch 500/920, Loss: 0.08268043398857117
Batch 600/920, Loss: 0.048679694533348083
Batch 700/920, Loss: 0.05581311509013176
Batch 800/920, Loss: 0.07852859795093536
Batch 900/920, Loss: 0.08576615899801254
Epoch 2 - Average Loss: 0.07392872004648265

Epoch 3/3
Batch 100/920, Loss: 0.0466112457215786
Batch 200/920, Loss: 0.038809917867183685
Batch 300/920, Loss: 0.1086466982960701
Batch 400/920, Loss: 0.1091292