In [1]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, hamming_loss
from sklearn.model_selection import train_test_split
import pandas as pd

In [2]:
df = pd.read_csv("dataset_001.csv")
df.head(3)

Unnamed: 0,id,text,tags
0,1,Um jovem camponês descobre um antigo artefato ...,fantasia|aventura
1,2,Dois colegas de trabalho desenvolvem sentiment...,romance|slice_of_life
2,3,"Após morrer em um acidente, um programador ren...",fantasia|isekai|aventura


In [3]:
X = df["text"].values
y_raw = df["tags"].apply(lambda x: x.split("|"))

In [4]:
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(y_raw)

print("Labels:", mlb.classes_)
print("Shape y:", y.shape)

Labels: ['acao' 'aventura' 'comedia' 'crime' 'dark' 'distopia' 'drama' 'fantasia'
 'ficcao_cientifica' 'guerra' 'isekai' 'mistério' 'mitologia'
 'pos_apocaliptico' 'romance' 'slice_of_life' 'suspense' 'terror'
 'thriller' 'tragédia' 'urbano']
Shape y: (40, 21)


In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42
)

print("X_train:", X_train.shape)
print("y_train:", y_train.shape)

X_train: (32,)
y_train: (32, 21)


In [6]:
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(
        max_features=100_000,
        ngram_range=(1, 2),
        min_df=5
    )),
    ("clf", OneVsRestClassifier(
        LogisticRegression(
            max_iter=1000,
            class_weight="balanced"
        )
    ))
])

pipeline.fit(X_train, y_train)

In [7]:
y_pred = pipeline.predict(X_test)

# Accuracy não serve como parametro nesse caso
print("F1 micro:", f1_score(y_test, y_pred, average="micro"))
print("F1 macro:", f1_score(y_test, y_pred, average="macro"))
print("Hamming loss:", hamming_loss(y_test, y_pred))

F1 micro: 0.2571428571428572
F1 macro: 0.15340136054421769
Hamming loss: 0.30952380952380953


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
