In [1]:
import torch
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
import numpy as np
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
import pandas as pd

In [2]:
df = pd.read_csv("dataset_001.csv")
df.head(3)

Unnamed: 0,id,text,tags
0,1,Um jovem camponês descobre um antigo artefato ...,fantasia|aventura
1,2,Dois colegas de trabalho desenvolvem sentiment...,romance|slice_of_life
2,3,"Após morrer em um acidente, um programador ren...",fantasia|isekai|aventura


In [3]:
X = df["text"].values
y_raw = df["tags"].apply(lambda x: x.split("|"))

In [4]:
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(y_raw)

print("Labels:", mlb.classes_)
print("Shape y:", y.shape)

Labels: ['acao' 'aventura' 'comedia' 'crime' 'dark' 'distopia' 'drama' 'fantasia'
 'ficcao_cientifica' 'guerra' 'isekai' 'mistério' 'mitologia'
 'pos_apocaliptico' 'romance' 'slice_of_life' 'suspense' 'terror'
 'thriller' 'tragédia' 'urbano']
Shape y: (40, 21)


In [5]:
class NovelDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )

        item = {k: v.squeeze(0) for k, v in encoding.items()}
        item["labels"] = torch.tensor(
            self.labels[idx],
            dtype=torch.float
        )

        return item


In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42
)

print("X_train:", X_train.shape)
print("y_train:", y_train.shape)

X_train: (32,)
y_train: (32, 21)


In [7]:
pretrained_model_name = "neuralmind/bert-base-portuguese-cased"

model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name,
    num_labels=len(mlb.classes_),
    problem_type="multi_label_classification"
)

tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name
)

  return self.fget.__get__(instance, owner)()
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
train_dataset = NovelDataset(
    X_train,
    y_train,
    tokenizer
)

val_dataset = NovelDataset(
    X_test,
    y_test,
    tokenizer
)


In [9]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    probs = 1 / (1 + np.exp(-logits))  # sigmoid
    preds = (probs > 0.5).astype(int)

    return {
        "f1_micro": f1_score(labels, preds, average="micro"),
        "f1_macro": f1_score(labels, preds, average="macro")
    }

In [10]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="f1_micro"
)

In [11]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Testando a classificação

In [12]:
text = "Um jovem é transportado para um mundo mágico cheio de perigos."

inputs = tokenizer(text, return_tensors="pt", truncation=True)

with torch.no_grad():
    logits = model(**inputs).logits

probs = torch.sigmoid(logits).numpy()[0]

for label, prob in zip(mlb.classes_, probs):
    if prob > 0.4:
        print(label, round(prob, 2))


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


acao 0.49
aventura 0.51
comedia 0.49
crime 0.45
dark 0.55
distopia 0.56
drama 0.46
fantasia 0.52
ficcao_cientifica 0.48
guerra 0.45
isekai 0.5
mistério 0.49
mitologia 0.53
pos_apocaliptico 0.54
romance 0.42
slice_of_life 0.49
suspense 0.5
terror 0.45
thriller 0.58
tragédia 0.55
urbano 0.5
