In [7]:
import nltk
import pandas as pd
import numpy as np
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split
from transformers import BertForSequenceClassification, AdamW
import torch

In [21]:
df = pd.read_csv(r"data.csv", encoding='utf-8')
print(df)
print(len(df['Emotion'].unique()))

                                                    Text   Emotion
9851   i feel dumb packing when i can t even get a st...   sadness
2674   i hi tech color club holiday splendor sally ha...     happy
11472  i was feeling this really weird sense of isola...  surprise
17715      i woke up about am feeling a little disturbed   sadness
12788  i am excited about new traditions with loved o...     happy
...                                                  ...       ...
2378     im postponing feeling virtuous about this labor     happy
6650   i feel hopeful like things are going to be gre...     happy
16809  i feel defeated but its okay hahaha my mid ter...   sadness
10588  i am back in the shire and although it is love...      fear
20385  Back in his office , McCready threw himself in...   sadness

[100 rows x 2 columns]
6


In [13]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

stop_words = set(nltk.corpus.stopwords.words('english'))
tokenized = tokenizer(df['Text'].tolist(), padding=True, truncation=True, max_length=512, 
                      return_attention_mask=True)
# Eliminar las stopwords de los tokens
tokenized_without_stopwords = []
for i in range(len(tokenized['input_ids'])):
    tokens = tokenizer.convert_ids_to_tokens(tokenized['input_ids'][i])
    tokens_without_stopwords = [token for token in tokens if token.lower() not in stop_words]
    input_ids = tokenizer.convert_tokens_to_ids(tokens_without_stopwords)
    attention_mask = [1] * len(input_ids)
    tokenized_without_stopwords.append({'input_ids': input_ids, 'attention_mask': attention_mask})

# Convertir las etiquetas a valores numéricos
labels = df['Emotion'].astype('category').cat.codes.tolist()


In [14]:
X_train, X_test, y_train, y_test = train_test_split(tokenized['input_ids'], labels, 
                                                    test_size=0.2, random_state=42)

In [15]:
# Inicializar el modelo
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', 
                                                      num_labels=len(df['Emotion'].unique()))

# Configurar el dispositivo a GPU si está disponible
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Inicializar el optimizador
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Ajustar el modelo en el conjunto de entrenamiento
model.train()
for epoch in range(3):
    for batch in range(len(X_train)):
        input_ids = torch.tensor(X_train[batch]).unsqueeze(0).to(device)
        labels = torch.tensor(y_train[batch]).unsqueeze(0).to(device)
        attention_mask = torch.tensor(tokenized['attention_mask'][batch]).unsqueeze(0).to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


KeyboardInterrupt: 

In [6]:
# Evaluar el modelo en el conjunto de prueba
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for batch in range(len(X_test)):
        input_ids = torch.tensor(X_test[batch]).unsqueeze(0).to(device)
        labels = torch.tensor(y_test[batch]).unsqueeze(0).to(device)
        attention_mask = torch.tensor(tokenized['attention_mask'][batch]).unsqueeze(0).to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        _, predicted = torch.max(outputs[1], 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
accuracy = 100 * correct / total
print(f'Precisión: {accuracy:.2f}%')
print(f"total {total}" )
print(f"correcto {correct}" )

Precisión: 31.50%
total 200
correcto 63
