In [14]:
import sys
import os
# Make modules that live next to this notebook importable.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
script_dir = os.getcwd()
if script_dir not in sys.path:
    sys.path.append(script_dir)

In [15]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, InputExample, losses, models
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch

In [16]:
# Download (or load from the local cache) the emotion dataset from the Hub.
# NOTE(review): trust_remote_code=True allows a dataset loading script to run
# locally; confirm it is still required for this dataset.
dataset = load_dataset(
    path="dair-ai/emotion",
    trust_remote_code=True,
)

In [17]:
# Pretrained checkpoint used for the classification model.
# Checkpoints noted as candidates:
#   - distilbert-base-uncased
#   - TinyBERT_General_4L_312D
#   - bert-base-uncased
model_used = "distilbert-base-uncased"

In [18]:
MAX_LENGTH = 128  # every example is padded/truncated to this many tokens

# Keep a dedicated handle on the Hugging Face tokenizer. A later cell in this
# notebook rebinds the generic name `tokenizer` to a Keras Tokenizer, which
# would silently break `tokenize` if this cell's map were re-run afterwards.
hf_tokenizer = DistilBertTokenizer.from_pretrained(model_used)
tokenizer = hf_tokenizer  # original name, still bound for code that relies on it


def tokenize(batch):
    """Tokenize a batch of examples for the transformer model.

    Args:
        batch: mapping with a 'text' entry holding the raw texts of the batch
            (as passed by `dataset.map(..., batched=True)`).

    Returns:
        The tokenizer output for the batch, padded to and truncated at
        MAX_LENGTH tokens.
    """
    return hf_tokenizer(batch['text'], padding='max_length', truncation=True, max_length=MAX_LENGTH)


# Add the tokenizer outputs as new columns on every split.
dataset = dataset.map(tokenize, batched=True)

# Return torch tensors for the columns the Trainer consumes.
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])



In [20]:
# Load the pretrained encoder with a sequence-classification head.
# The classifier weights are newly initialized (see the warning this cell
# prints), so the model has to be fine-tuned before it is used for inference.
model = DistilBertForSequenceClassification.from_pretrained(
    model_used,
    num_labels=6,  # size of the classification head
)

def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation run.

    Args:
        pred: object exposing `label_ids` (reference class ids) and
            `predictions` (per-class scores; the argmax over the last axis is
            taken as the predicted class).

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # zero_division=0 gives the same numbers as the default setting but does
    # not emit a warning when some class is never predicted.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
# Training configuration.
# Device selection is left to the Trainer. The previous
# `no_cuda=not torch.cuda.is_available()` forced CPU whenever CUDA was
# missing, which also ruled out non-CUDA accelerators, and it added nothing
# on CUDA machines where the GPU is used by default.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy`; switch once the installed version requires it.
training_args = TrainingArguments(
    output_dir='./results',          # directory for checkpoints and results
    evaluation_strategy="epoch",    # evaluate at the end of every epoch
    per_device_train_batch_size=8,  # training batch size per device
    per_device_eval_batch_size=16,  # evaluation batch size per device
    num_train_epochs=3,             # number of training epochs
    weight_decay=0.01,              # weight decay rate
    logging_dir='./logs',           # directory for the logs
    logging_steps=10,               # log every 10 steps
)

# Build the Trainer around the model, the tokenized splits and the metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['validation'],
    compute_metrics=compute_metrics
)

# Fine-tune, then run a final evaluation on the validation split.
trainer.train()
eval_result = trainer.evaluate()

print("Resultados da avaliação:", eval_result)


  0%|          | 0/6000 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, GlobalAveragePooling1D
from tensorflow.keras.utils import to_categorical

In [None]:
# Parameters for the Keras baseline
MAX_WORDS = 10000   # maximum number of words kept in the vocabulary
MAX_LENGTH = 128    # maximum sequence length
EMBEDDING_DIM = 100 # embedding dimension

# Prepare the data: raw texts and integer labels for both splits.
train_texts = dataset['train']['text']
val_texts = dataset['validation']['text']
# Labels are coerced to NumPy arrays. Earlier in this notebook the dataset is
# switched to torch output for the 'label' column; tensor elements hash by
# identity, so len(set(labels)) would count rows instead of distinct classes.
train_labels = np.asarray(dataset['train']['label'])
val_labels = np.asarray(dataset['validation']['label'])

# Word-level tokenization, vocabulary fitted on the training texts only.
# NOTE: this rebinds `tokenizer`, replacing the transformer tokenizer that an
# earlier cell stored under the same name.
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(train_texts)

train_sequences = tokenizer.texts_to_sequences(train_texts)
val_sequences = tokenizer.texts_to_sequences(val_texts)

# Pad/truncate every sequence to MAX_LENGTH entries.
X_train = pad_sequences(train_sequences, maxlen=MAX_LENGTH)
X_val = pad_sequences(val_sequences, maxlen=MAX_LENGTH)

# One-hot encode the labels; the class count is the number of distinct
# label values seen in the training split.
num_classes = int(np.unique(train_labels).size)
y_train = to_categorical(train_labels, num_classes=num_classes)
y_val = to_categorical(val_labels, num_classes=num_classes)

In [None]:
# Build the baseline: embedding -> mean over the sequence -> small dense head.
# NOTE: this rebinds `model`, replacing the transformer model created earlier.
model = Sequential()
model.add(Embedding(input_dim=MAX_WORDS, output_dim=EMBEDDING_DIM, input_length=MAX_LENGTH))
model.add(GlobalAveragePooling1D())  # averages the embeddings over the sequence axis
model.add(Dense(32, activation="relu"))
model.add(Dense(num_classes, activation="softmax"))

# Compile with a loss that matches the one-hot encoded targets.
model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

In [None]:
# Train the baseline and report validation metrics after every epoch.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    batch_size=32,
    epochs=10,
    verbose=1,
)

Epoch 1/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - accuracy: 0.3303 - loss: 1.5895 - val_accuracy: 0.3520 - val_loss: 1.5710
Epoch 2/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 0.3516 - loss: 1.5650 - val_accuracy: 0.3015 - val_loss: 1.5713
Epoch 3/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - accuracy: 0.4147 - loss: 1.5068 - val_accuracy: 0.4165 - val_loss: 1.4280
Epoch 4/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 0.5092 - loss: 1.3342 - val_accuracy: 0.5715 - val_loss: 1.2041
Epoch 5/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - accuracy: 0.6056 - loss: 1.0805 - val_accuracy: 0.6425 - val_loss: 0.9818
Epoch 6/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - accuracy: 0.6709 - loss: 0.8833 - val_accuracy: 0.7035 - val_loss: 0.8768
Epoch 7/10
[1m500/500[0m 