In [1]:
import numpy as np
from joblib import dump
import pandas as pd
pd.set_option('display.max_colwidth', 180)

## Leer los datos

El dataset se eleigió de Kaggle https://www.kaggle.com/code/shtrausslearning/twitter-emotion-classification/notebook

El cual ya se limpió previamente, por lo que solo cambiaremos algunas contracciones en sus "palabras originales", aplicaremos lematización y después lo convertiremos a embeddings con un Transfromer

In [2]:
test = pd.read_csv('data/test.csv')
train = pd.read_csv('data/training.csv')
validation = pd.read_csv('data/validation.csv')

print('Test shape:', test.shape)
print('Train shape:', train.shape)
print('Validation shape:', validation.shape)

Test shape: (2000, 2)
Train shape: (16000, 2)
Validation shape: (2000, 2)


In [3]:
test.head(10)

Unnamed: 0,text,label
0,im feeling rather rotten so im not very ambitious right now,0
1,im updating my blog because i feel shitty,0
2,i never make her separate from me because i don t ever want her to feel like i m ashamed with her,0
3,i left with my bouquet of red and yellow tulips under my arm feeling slightly more optimistic than when i arrived,1
4,i was feeling a little vain when i did this one,0
5,i cant walk into a shop anywhere where i do not feel uncomfortable,4
6,i felt anger when at the end of a telephone call,3
7,i explain why i clung to a relationship with a boy who was in many ways immature and uncommitted despite the excitement i should have been feeling for getting accepted into the...,1
8,i like to have the same breathless feeling as a reader eager to see what will happen next,1
9,i jest i feel grumpy tired and pre menstrual which i probably am but then again its only been a week and im about as fit as a walrus on vacation for the summer,3


In [4]:
import re

# Dictionary of replacements
replacements = {
    "i" : "I",
    "im" : "I am",
    "Im" : "I am",
    "i m" : "I am",
    "I m" : "I am",
    "hadn t" : "had not",
    "hasn t" : "has not",
    "haven t" : "have not", 
    "don t" : "do not",
    "can t" : "cannot",
    "didn t" : "did not",
    "aren t" : "are not",
    "isn t" : "is not",
    "it s" : "it is",
    "ive" : "I have",
    "it d" : "it would",
    "how d" : "how did",
    "could ve" : "could have",
    "cuz" : "because",
    "gotta" : "got to",
    "kinda" : "kind of",
    "lemme" : "let me",
    "o clock" : "of the clock",
    "y ever" : "have you ever",
    "y know" : "you know",
    "you ll" : "you will",
    "you d" : "you had",
    "why s" : "why is",
    "why re" : "why are",
    "won t" : "will not",
    "would ve": "would have",
    "til" : "until",
    "tis" : "it is",
    "somebody s": "somebody is",
    "someone s": "someone is",
    "mine s" : "mine is"
}

# Function to apply replacements
def replace_words(text, replacements):
    pattern = re.compile(r'\b(' + '|'.join(re.escape(key) for key in replacements.keys()) + r')\b', re.IGNORECASE)
    return pattern.sub(lambda x: replacements[x.group().lower()], text)

# Remove the contractions in each DataFrame
test['text'] = test['text'].apply(replace_words, replacements=replacements)
train['text'] = train['text'].apply(replace_words, replacements=replacements)
validation['text'] = validation['text'].apply(replace_words, replacements=replacements)

In [5]:
test.head()

Unnamed: 0,text,label
0,I am feeling rather rotten so I am not very ambitious right now,0
1,I am updating my blog because I feel shitty,0
2,I never make her separate from me because I do not ever want her to feel like I m ashamed with her,0
3,I left with my bouquet of red and yellow tulips under my arm feeling slightly more optimistic than when I arrived,1
4,I was feeling a little vain when I did this one,0


Convert tokens into embeddings with a Transformer trained for sentiment analysis in tweets. https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment

In [6]:
from transformers import AutoTokenizer, AutoModel
import torch

tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment") # obtiene un ID para cada palabra
model = AutoModel.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment") # "mapea los ID a embeddings"

  from .autonotebook import tqdm as notebook_tqdm
2025-01-04 22:22:45.518676: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-04 22:22:45.591059: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

## Convertir el texto a embeddings

- Dividir el texto en tokens. `tokenizer` Convierte cada token en un número (su ID en el vocabulario del modelo). Añadiendo tokens especiales: ([CLS] al inicio y [SEP] al final) Cuenta con los siguientes parametros:
    - `padding=True`: Rellena con tokens especiales [PAD] hasta que todos los textos del batch tengan la misma longitud
    - `truncation=True`: Corta los textos que excedan max_length
    - `max_length=512`: Longitud máxima permitida
    - `return_tensors="pt"`: Devuelve tensores de PyTorch

- El resultado encoded es un diccionario que contiene:
    - `input_ids`: IDs numéricos de cada token
    - `attention_mask`: 1s para tokens reales, 0s para padding

- El modelo acepta este diccionario y procesa los tokens a través de varias capas de transformers. Cada token interactúa con todos los demás tokens mediante self-attention. Generando representaciones contextuales para cada token


- `outputs.last_hidden_state[:, 0, :]` obtiene el embedding del token [CLS}, ya que tiene la forma de `[batch_size, sequence_length, hidden_size]`
    - primer índice: selecciona todos los elementos del batch
    - segundo índice: selecciona solo el primer token ([CLS])
    - tercer índice: selecciona todas las dimensiones del embedding

- El token [CLS] es importante porque el modelo está entrenado para que este token capture la información semántica de toda la secuencia


In [7]:
text = "I love programming!"

# Tokenize the text
res = tokenizer(text)
print(res)
# Tokens
tokens = tokenizer.tokenize(text)
print(tokens)
# Ids
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)
# Decode
decoded = tokenizer.decode(ids)
print(decoded)

{'input_ids': [0, 100, 657, 8326, 328, 2], 'attention_mask': [1, 1, 1, 1, 1, 1]}
['I', 'Ġlove', 'Ġprogramming', '!']
[100, 657, 8326, 328]
I love programming!


In [22]:
def get_embeddings(df, tokenizer, model, batch_size=16, output_file=None):
    model.eval()  # Poner el modelo en modo evaluación
    embeddings = []
    texts = df['text'].tolist()

    # Procesar los textos en batches
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]

        # Tokenizar el batch completo
        encoded = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt"
        )
        # Obtener embeddings
        with torch.no_grad():
            outputs = model(**encoded)

        # Obtener embeddings del token [CLS]
        batch_embeddings = outputs.last_hidden_state[:, 0, :].numpy()
        embeddings.append(batch_embeddings)

    # Concatenar todos los embeddings
    embeddings = np.vstack(embeddings)

    # Guardar embeddings si se especifica un archivo de salida
    if output_file:
        dump(embeddings, output_file)
        print(f"Embeddings guardados en: {output_file}")

    return embeddings

In [9]:
test_embeddings = get_embeddings(test, tokenizer, model, batch_size=32,output_file='data/test_embeddings.pkl')
train_embeddings = get_embeddings(train, tokenizer, model, batch_size=32,output_file='data/train_embeddings.pkl')
validation_embeddings = get_embeddings(validation, tokenizer, model, batch_size=32,output_file='data/validation_embeddings.pkl')

# Imprimir forma del resultado
print(f"Test embedding shape: {test_embeddings.shape}")
print(f"Train embedding shape: {train_embeddings.shape}")
print(f"Validation embedding shape: {validation_embeddings.shape}")

{'input_ids': tensor([[   0,  100,  524,  ...,    1,    1,    1],
        [   0,  100,  524,  ...,    1,    1,    1],
        [   0,  100,  393,  ...,    1,    1,    1],
        ...,
        [   0,  100, 2200,  ...,    1,    1,    1],
        [   0,  100,   33,  ...,    1,    1,    1],
        [   0,  100, 1278,  ...,    1,    1,    1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}
{'input_ids': tensor([[   0,  100,  386,  ...,    1,    1,    1],
        [   0,  100,  619,  ...,    1,    1,    1],
        [   0,  100,  619,  ...,    1,    1,    1],
        ...,
        [   0,  100,  619,  ...,    1,    1,    1],
        [   0,  100, 1240,  ...,   19, 5450,    2],
        [   0,  100,   64,  ...,    1,    1,    1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0,

KeyboardInterrupt: 