In [1]:
# Importar librerías necesarias
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import DistilBertTokenizer, DistilBertModel
import torch
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm

# Paso 1: Cargar el dataset
data = pd.read_csv('data/tweet_emotions.csv')

# Paso 2: Preprocesamiento y simplificación de clases
def preprocess_text(text):
    # Convertir a minúsculas
    text = text.lower()
    # Eliminar URLs
    text = re.sub(r'http\S+|www.\S+', '', text)
    # Eliminar caracteres especiales y números
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenizar y eliminar palabras cortas (menos de 3 caracteres)
    tokens = text.split()
    tokens = [word for word in tokens if len(word) > 2]
    return ' '.join(tokens)

# Simplificar clases menores
def simplify_classes(label):
    minor_classes = ['anger', 'boredom', 'empty', 'enthusiasm', 'fun', 'hate', 'relief', 'surprise']
    return label if label not in minor_classes else 'other'

# Aplicar preprocesamiento y simplificación
data['clean_content'] = data['content'].apply(preprocess_text)
data['simplified_sentiment'] = data['sentiment'].apply(simplify_classes)

# Paso 3: División de datos
X_train, X_test, y_train, y_test = train_test_split(
    data['clean_content'], 
    data['simplified_sentiment'], 
    test_size=0.2, 
    random_state=42
)

# Paso 4: Generar embeddings usando DistilBERT
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
bert_model = DistilBertModel.from_pretrained('distilbert-base-uncased')

def get_bert_embeddings(texts, tokenizer, model, max_length=128):
    embeddings = []
    for text in tqdm(texts, desc="Generating BERT embeddings"):
        encoded_input = tokenizer(text, padding='max_length', truncation=True, max_length=max_length, return_tensors='pt')
        with torch.no_grad():
            output = model(**encoded_input)
        embeddings.append(output.last_hidden_state[:, 0, :].squeeze().numpy())
    return embeddings

X_train_bert = get_bert_embeddings(X_train, tokenizer, bert_model)
X_test_bert = get_bert_embeddings(X_test, tokenizer, bert_model)

# Paso 5: Entrenar un clasificador
classifier = LogisticRegression(max_iter=1000, random_state=42)
classifier.fit(X_train_bert, y_train)

# Paso 6: Predicción y evaluación
y_pred = classifier.predict(X_test_bert)
report = classification_report(y_test, y_pred)
print(report)


  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating BERT embeddings:   2%|▏         | 486/32000 [02:44<2:58:09,  2.95it/s]


KeyboardInterrupt: 