In [1]:
#training data

In [2]:
import pandas as pd

# Cargar el archivo CSV
df = pd.read_csv("eng_train.csv")

# Verificar la estructura del archivo
df

Unnamed: 0,id,text,Joy,Fear,Anger,Sadness,Surprise
0,eng_train_track2_001,None of us has mentioned the incident since.,0,1,0,2,1
1,eng_train_track2_002,"I was 7 and woke up early, so I went to the ba...",1,0,0,0,0
2,eng_train_track2_003,By that point I felt like someone was stabbing...,0,3,0,0,0
3,eng_train_track2_004,watching her leave with dudes drove me crazy.,0,1,3,1,0
4,eng_train_track2_005,`` My eyes widened.,0,1,0,0,2
...,...,...,...,...,...,...,...
2763,eng_train_track2_2764,"My face is cold, and my hands are guilty.",0,1,0,1,0
2764,eng_train_track2_2765,I remembered how I dragged his box into the be...,1,0,0,0,0
2765,eng_train_track2_2766,As I walked in the door she came around the co...,3,0,0,0,1
2766,eng_train_track2_2767,They kept me at the hospital for 24 hours-and ...,0,1,0,1,0


In [3]:
import spacy
import re
import nltk
import unicodedata
import requests
from spacy_syllables import SpacySyllables
from bs4 import BeautifulSoup
from nltk import TweetTokenizer
from spacy.lang.es import Spanish
from spacy.lang.en import English
from nltk.util import ngrams
import pandas as pd
import contractions  # Importamos la librería para expandir contracciones
import unicodedata



#df = pd.DataFrame(df_final)

# Clase TextProcessing ya definida anteriormente
class TextProcessing(object):
    name = 'Text Processing'
    lang = 'en'

    def __init__(self, lang: str = 'en'):
        self.lang = lang

    @staticmethod
    def nlp(text: str) -> list:
        try:
            list_tagger = []
            tp_nlp = TextProcessing.load_spacy(TextProcessing.lang)
            doc = tp_nlp(text.lower())
            for token in doc:
                item = {'text': token.text, 'lemma': token.lemma_, 'pos': token.pos_, 'tag': token.tag_,
                        'dep': token.dep_, 'shape': token.shape_, 'is_alpha': token.is_alpha,
                        'is_stop': token.is_stop, 'is_digit': token.is_digit, 'is_punct': token.is_punct,
                        'syllables': token._.syllables}
                list_tagger.append(item)
            return list_tagger
        except Exception as e:
            print('Error nlp: {0}'.format(e))

    @staticmethod
    def load_spacy(lang: str) -> object:
        try:
            spacy_model = {'es': 'es_core_news_sm', 'en': 'en_core_web_sm'}
            if not spacy.util.is_package(spacy_model[lang]):
                spacy.cli.download(spacy_model[lang])

            component = spacy.load(spacy_model[lang])
            SpacySyllables(component)
            component.add_pipe('syllables', last=True)
            return component
        except Exception as e:
            print('Error load spacy: {0}'.format(e))

    @staticmethod
    def proper_encoding(text: str) -> str:
        try:
            text = unicodedata.normalize('NFD', text)
            text = text.encode('ascii', 'ignore')
            return text.decode("utf-8")
        except Exception as e:
            print('Error proper_encoding: {0}'.format(e))

    @staticmethod
    def stopwords(text: str) -> str:
        try:
            nlp = English()
            doc = nlp(text)
            token_list = [token.text for token in doc]
            sentence = []
            for word in token_list:
                lexeme = nlp.vocab[word]
                if not lexeme.is_stop:
                    sentence.append(word)
            return ' '.join(sentence)
        except Exception as e:
            print('Error stopwords: {0}'.format(e))

    @staticmethod
    def remove_patterns(text: str) -> str:
        try:
            text = re.sub(r'\©|\×|\⇔|\_|\»|\«|\~|\#|\$|\€|\Â|\�|\¬', '', text)
            text = re.sub(r'\,|\;|\:|\!|\¡|\’|\‘|\”|\“|\"|\'|\`', '', text)
            text = re.sub(r'\}|\{|\[|\]|\(|\)|\<|\>|\?|\¿|\°|\|', '', text)
            text = re.sub(r'\/|\-|\+|\*|\=|\^|\%|\&|\$', '', text)
            text = re.sub(r'\b\d+(?:\.\d+)?\s+', '', text)
            return text.lower()
        except Exception as e:
            print('Error remove_patterns: {0}'.format(e))

    @staticmethod
    def expand_contractions(text: str) -> str:
        """Expande las contracciones en el texto."""
        try:
            return contractions.fix(text)
        except Exception as e:
            print('Error expand_contractions: {0}'.format(e))

    @staticmethod
    def transformer(text: str, stopwords: bool = False) -> str:
        try:
            text_out = TextProcessing.proper_encoding(text)
            text_out = TextProcessing.expand_contractions(text_out)  # Expandimos las contracciones
            text_out = text_out.lower()
            text_out = re.sub("[\U0001f000-\U000e007f]", '[EMOJI]', text_out)
            text_out = re.sub(
                r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+'
                r'|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))',
                '[URL]', text_out)
            text_out = re.sub("@", '[MENTION]', text_out)
            text_out = re.sub("#([A-Za-z0-9_]{1,40})", '[HASTAG]', text_out)
            text_out = TextProcessing.remove_patterns(text_out)
            text_out = TextProcessing.stopwords(text_out) if stopwords else text_out
            text_out = re.sub(r'\s+', ' ', text_out).strip()
            text_out = text_out.rstrip()
            return text_out if text_out != ' ' else None
        except Exception as e:
            print('Error transformer: {0}'.format(e))


# Función para aplicar el preprocesamiento al DataFrame
def apply_preprocessing_to_df(df: pd.DataFrame, column: str) -> pd.DataFrame:
    tp = TextProcessing(lang='en')  # Inicializamos el preprocesador en inglés
    df[f'{column}_processed'] = df[column].apply(lambda x: tp.transformer(x))
    return df


In [4]:
#Emotion weights in VAD
vad_values2 = {
    "Anger": {"V": 0.167, "A": 0.865, "D": 0.657},
    "Fear": {"V": 0.73, "A": 0.840, "D": 0.293},
    "Joy": {"V": 0.980, "A": 0.824, "D": 0.794},
    "Sadness": {"V": 0.52, "A": 0.288, "D": 0.164},
    "Surprise": {"V": 0.875, "A": 0.875, "D": 0.562},
}


from sklearn.model_selection import train_test_split



# Función para calcular V, A y D
def calculate_vad(row):
    emotions = ["Anger", "Fear", "Joy", "Sadness", "Surprise"]
    total_intensity = sum(row[emotion] for emotion in emotions)
    if total_intensity == 0:
        return 0.500, 0.500, 0.500  # Valores para "No Emotion"

    V = sum(row[emotion] * vad_values2[emotion]["V"] for emotion in emotions) / total_intensity
    A = sum(row[emotion] * vad_values2[emotion]["A"] for emotion in emotions) / total_intensity
    D = sum(row[emotion] * vad_values2[emotion]["D"] for emotion in emotions) / total_intensity
    return round(V, 3), round(A, 3), round(D, 3)

# Aplicar la función para calcular V, A y D
df[["V", "A", "D"]] = df.apply(calculate_vad, axis=1, result_type="expand")

# Preprocesar texto
tp = TextProcessing(lang='en')
df["text_processed"] = df["text"].apply(lambda x: tp.transformer(x))

# Dividir los datos en entrenamiento y validación (80-20)
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Mostrar resultados
print(train_df.head())
print(val_df.head())


                         id  \
2124  eng_train_track2_2125   
2716  eng_train_track2_2717   
2232  eng_train_track2_2233   
261    eng_train_track2_262   
2059  eng_train_track2_2060   

                                                   text  Joy  Fear  Anger  \
2124  26 January 2011 @ 04:45 pm Boys & Girls 718 Cr...    0     2      0   
2716  We headed north on a sunny Saturday morning in...    1     0      0   
2232  I looked down to find five small white arrows ...    0     3      0   
261                         I've never gone back there.    0     2      0   
2059                                     My heart sank.    0     0      0   

      Sadness  Surprise      V      A      D  \
2124        0         0  0.730  0.840  0.293   
2716        0         0  0.980  0.824  0.794   
2232        3         2  0.688  0.642  0.312   
261         2         0  0.625  0.564  0.228   
2059        2         0  0.520  0.288  0.164   

                                         text_processed  
212

In [5]:
import re
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_scheduler
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import unicodedata
from spacy_syllables import SpacySyllables
import spacy
from sklearn.model_selection import train_test_split
from torch.nn.functional import softmax
# Tokenización con BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

def BERT_tokenization(df, text_column):
    input_ids = []
    attention_masks = []

    for sent in df[text_column]:
        encoded_dict = tokenizer.encode_plus(
            sent,
            add_special_tokens=True,
            max_length=128,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0)

train_input_ids, train_attention_masks = BERT_tokenization(train_df, "text_processed")
val_input_ids, val_attention_masks = BERT_tokenization(val_df, "text_processed")

# Convertir etiquetas
train_labels = torch.tensor(train_df[["V", "A", "D"]].values, dtype=torch.float32)
val_labels = torch.tensor(val_df[["V", "A", "D"]].values, dtype=torch.float32)

# Crear datasets
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
val_dataset = TensorDataset(val_input_ids, val_attention_masks, val_labels)


2025-02-27 22:48:38.440791: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-02-27 22:48:38.452031: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1740696518.464895  214662 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1740696518.468769  214662 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-27 22:48:38.483712: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [6]:
#TRAIN
import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from scipy.stats import pearsonr  # Para calcular Pearson r
from tqdm import tqdm
from transformers import BertForSequenceClassification, AdamW, get_scheduler

batch_size = 8

# Nota: Asegúrate de que train_dataset y val_dataset estén correctamente definidos.
train_dataloader  = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
validation_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)

# Configurar el modelo BERT
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=3,  # Tres etiquetas: V, A, D
    output_attentions=False,
    output_hidden_states=False,
)
model.to(device)

# Configurar optimizador y scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
num_epochs = 3
num_training_steps = len(train_dataloader) * num_epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

def evaluate_metrics(predictions, labels):
    """
    Calcula métricas de clasificación generales (a partir del argmax)
    y calcula el Pearson r para cada dimensión (V, A y D) individualmente.
    """
    # Para las métricas de clasificación usamos argmax
    pred_flat = np.argmax(predictions, axis=1)
    labels_flat = np.argmax(labels, axis=1)

    accuracy = accuracy_score(labels_flat, pred_flat)
    precision_micro = precision_score(labels_flat, pred_flat, average="micro")
    precision_macro = precision_score(labels_flat, pred_flat, average="macro")
    recall_micro = recall_score(labels_flat, pred_flat, average="micro")
    recall_macro = recall_score(labels_flat, pred_flat, average="macro")
    f1_micro = f1_score(labels_flat, pred_flat, average="micro")
    f1_macro = f1_score(labels_flat, pred_flat, average="macro")
    
    # Ahora, para Pearson r, se calcula por cada emoción (cada columna)
    # Se asume que las dimensiones son: índice 0 = V, 1 = A y 2 = D.
    emotion_names = ['V', 'A', 'D']
    pearson_r_dict = {}
    for i in range(predictions.shape[1]):
        pred_i = predictions[:, i]
        label_i = labels[:, i]
        if np.std(pred_i) == 0 or np.std(label_i) == 0:
            pearson_r_i = float('nan')
        else:
            pearson_r_i, _ = pearsonr(pred_i, label_i)
        pearson_r_dict[emotion_names[i]] = pearson_r_i

    return {
        "accuracy": accuracy,
        "precision_micro": precision_micro,
        "precision_macro": precision_macro,
        "recall_micro": recall_micro,
        "recall_macro": recall_macro,
        "f1_micro": f1_micro,
        "f1_macro": f1_macro,
        "pearson_r": pearson_r_dict
    }

def train_and_evaluate():
    metrics_per_epoch = []

    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch + 1}/{num_epochs}")
        
        # Fase de entrenamiento
        model.train()
        total_loss = 0
        
        for batch in tqdm(train_dataloader, desc="Training"):
            b_input_ids, b_attention_mask, b_labels = tuple(t.to(device) for t in batch)
            model.zero_grad()

            outputs = model(b_input_ids, attention_mask=b_attention_mask, labels=b_labels)
            loss = outputs.loss
            logits = outputs.logits
            total_loss += loss.item()

            loss.backward()
            optimizer.step()
            lr_scheduler.step()

        avg_train_loss = total_loss / len(train_dataloader)
        print(f"Average training loss: {avg_train_loss:.4f}")

        # Fase de evaluación
        model.eval()
        eval_predictions = []
        eval_labels = []

        for batch in tqdm(validation_dataloader, desc="Validation"):
            b_input_ids, b_attention_mask, b_labels = tuple(t.to(device) for t in batch)
            with torch.no_grad():
                outputs = model(b_input_ids, attention_mask=b_attention_mask)
                logits = outputs.logits

            eval_predictions.extend(logits.detach().cpu().numpy())
            eval_labels.extend(b_labels.detach().cpu().numpy())

        metrics = evaluate_metrics(np.array(eval_predictions), np.array(eval_labels))
        metrics["train_loss"] = avg_train_loss
        metrics_per_epoch.append(metrics)
        
        print(f"Accuracy: {metrics['accuracy']:.4f}")
        print(f"Precision (Micro): {metrics['precision_micro']:.4f}, Precision (Macro): {metrics['precision_macro']:.4f}")
        print(f"Recall (Micro): {metrics['recall_micro']:.4f}, Recall (Macro): {metrics['recall_macro']:.4f}")
        print(f"F1 (Micro): {metrics['f1_micro']:.4f}, F1 (Macro): {metrics['f1_macro']:.4f}")
        # Imprimir Pearson r para cada emoción
        for emo, pr in metrics['pearson_r'].items():
            print(f"Pearson r ({emo}): {pr:.4f}")

    # Calcular métricas promedio a lo largo de las épocas
    avg_metrics = {}
    for key in metrics_per_epoch[0].keys():
        if key != 'pearson_r':
            avg_metrics[key] = np.mean([epoch_metrics[key] for epoch_metrics in metrics_per_epoch])
        else:
            avg_metrics[key] = {}
            for emotion in metrics_per_epoch[0]['pearson_r'].keys():
                avg_metrics[key][emotion] = np.mean([epoch_metrics['pearson_r'][emotion] for epoch_metrics in metrics_per_epoch])

    print("\nFinal Average Metrics Across Epochs:")
    print(f"Accuracy: {avg_metrics['accuracy']:.4f}")
    print(f"Precision (Micro): {avg_metrics['precision_micro']:.4f}, Precision (Macro): {avg_metrics['precision_macro']:.4f}")
    print(f"Recall (Micro): {avg_metrics['recall_micro']:.4f}, Recall (Macro): {avg_metrics['recall_macro']:.4f}")
    print(f"F1 (Micro): {avg_metrics['f1_micro']:.4f}, F1 (Macro): {avg_metrics['f1_macro']:.4f}")
    for emo, pr in avg_metrics['pearson_r'].items():
        print(f"Pearson r ({emo}): {pr:.4f}")

    return metrics_per_epoch, avg_metrics

# Ejecutar el entrenamiento y la evaluación
metrics_per_epoch, avg_metrics = train_and_evaluate()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/3


Training: 100%|██████████| 277/277 [14:08<00:00,  3.06s/it]


Average training loss: 0.6157


Validation: 100%|██████████| 70/70 [01:04<00:00,  1.08it/s]


Accuracy: 0.6498
Precision (Micro): 0.6498, Precision (Macro): 0.6571
Recall (Micro): 0.6498, Recall (Macro): 0.6587
F1 (Micro): 0.6498, F1 (Macro): 0.6497
Pearson r (V): 0.5287
Pearson r (A): 0.5386
Pearson r (D): 0.6569

Epoch 2/3


Training: 100%|██████████| 277/277 [14:17<00:00,  3.10s/it]


Average training loss: 0.5822


Validation: 100%|██████████| 70/70 [01:04<00:00,  1.08it/s]


Accuracy: 0.7455
Precision (Micro): 0.7455, Precision (Macro): 0.7476
Recall (Micro): 0.7455, Recall (Macro): 0.7519
F1 (Micro): 0.7455, F1 (Macro): 0.7448
Pearson r (V): 0.5405
Pearson r (A): 0.5868
Pearson r (D): 0.6818

Epoch 3/3


Training: 100%|██████████| 277/277 [14:08<00:00,  3.06s/it]


Average training loss: 0.5663


Validation: 100%|██████████| 70/70 [01:04<00:00,  1.08it/s]

Accuracy: 0.7437
Precision (Micro): 0.7437, Precision (Macro): 0.7494
Recall (Micro): 0.7437, Recall (Macro): 0.7528
F1 (Micro): 0.7437, F1 (Macro): 0.7434
Pearson r (V): 0.5489
Pearson r (A): 0.5902
Pearson r (D): 0.6759

Final Average Metrics Across Epochs:
Accuracy: 0.7130
Precision (Micro): 0.7130, Precision (Macro): 0.7181
Recall (Micro): 0.7130, Recall (Macro): 0.7211
F1 (Micro): 0.7130, F1 (Macro): 0.7126
Pearson r (V): 0.5394
Pearson r (A): 0.5719
Pearson r (D): 0.6715





In [7]:
#save the model
from transformers import BertForSequenceClassification, BertTokenizer

# Directorio donde se guardará el modelo
output_dir = "./modelo_entrenado_completo"

# Crear el directorio si no existe
import os
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Guardar el modelo
model.save_pretrained(output_dir)

# Guardar el tokenizador
tokenizer.save_pretrained(output_dir)

print(f"Modelo y tokenizador guardados en {output_dir}")

Modelo y tokenizador guardados en ./modelo_entrenado_completo


In [8]:
torch.save(model.state_dict(), 'modelo_entrenado.pth')
print("Modelo guardado exitosamente.")


Modelo guardado exitosamente.


In [9]:
#
def validate_model():
    model.eval()  # Cambiar el modelo al modo de evaluación
    eval_predictions = []
    eval_labels = []

    # Proceso de validación
    for batch in tqdm(validation_dataloader, desc="Validating"):
        b_input_ids, b_attention_mask, b_labels = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            outputs = model(b_input_ids, attention_mask=b_attention_mask)
            logits = outputs.logits

        eval_predictions.extend(logits.detach().cpu().numpy())
        eval_labels.extend(b_labels.detach().cpu().numpy())

    # Calcular métricas
    metrics = evaluate_metrics(np.array(eval_predictions), np.array(eval_labels))
    
    print("\nValidation Metrics:")
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"Precision (Micro): {metrics['precision_micro']:.4f}, Precision (Macro): {metrics['precision_macro']:.4f}")
    print(f"Recall (Micro): {metrics['recall_micro']:.4f}, Recall (Macro): {metrics['recall_macro']:.4f}")
    print(f"F1 (Micro): {metrics['f1_micro']:.4f}, F1 (Macro): {metrics['f1_macro']:.4f}")

    return metrics

# Ejecutar la validación
validation_metrics = validate_model()

Validating: 100%|██████████| 70/70 [01:04<00:00,  1.08it/s]


Validation Metrics:
Accuracy: 0.7437
Precision (Micro): 0.7437, Precision (Macro): 0.7494
Recall (Micro): 0.7437, Recall (Macro): 0.7528
F1 (Micro): 0.7437, F1 (Macro): 0.7434





In [10]:

# Cargar los datos del archivo CSV
df1 = pd.read_csv("eng_dev.csv")
df1

Unnamed: 0,id,text
0,eng_dev_track2_001,"I have a floor shift in the morning, hopefully..."
1,eng_dev_track2_002,What is it about this winter that is making me...
2,eng_dev_track2_003,"Longest, most awkward drive I've ever taken."
3,eng_dev_track2_004,"I know not why, I wipe my face."
4,eng_dev_track2_005,"And I laughed like this: garhahagar, because m..."
...,...,...
111,eng_dev_track2_112,My heart sank.
112,eng_dev_track2_113,I remember the sweat burning in my eyes and tr...
113,eng_dev_track2_114,My sister was walking backwards and bumped her...
114,eng_dev_track2_115,"I can't breathe right, my head has been stuffe..."


In [11]:
#DATA TO VALIDATE DEV (revisar)
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm

# Cargar los datos del archivo CSV
df1 = pd.read_csv("eng_dev.csv")
# Clase TextProcessing ya definida anteriormente
class TextProcessing(object):
    name = 'Text Processing'
    lang = 'en'

    def __init__(self, lang: str = 'en'):
        self.lang = lang

    @staticmethod
    def nlp(text: str) -> list:
        try:
            list_tagger = []
            tp_nlp = TextProcessing.load_spacy(TextProcessing.lang)
            doc = tp_nlp(text.lower())
            for token in doc:
                item = {'text': token.text, 'lemma': token.lemma_, 'pos': token.pos_, 'tag': token.tag_,
                        'dep': token.dep_, 'shape': token.shape_, 'is_alpha': token.is_alpha,
                        'is_stop': token.is_stop, 'is_digit': token.is_digit, 'is_punct': token.is_punct,
                        'syllables': token._.syllables}
                list_tagger.append(item)
            return list_tagger
        except Exception as e:
            print('Error nlp: {0}'.format(e))

    @staticmethod
    def load_spacy(lang: str) -> object:
        try:
            spacy_model = {'es': 'es_core_news_sm', 'en': 'en_core_web_sm'}
            if not spacy.util.is_package(spacy_model[lang]):
                spacy.cli.download(spacy_model[lang])

            component = spacy.load(spacy_model[lang])
            SpacySyllables(component)
            component.add_pipe('syllables', last=True)
            return component
        except Exception as e:
            print('Error load spacy: {0}'.format(e))

    @staticmethod
    def proper_encoding(text: str) -> str:
        try:
            text = unicodedata.normalize('NFD', text)
            text = text.encode('ascii', 'ignore')
            return text.decode("utf-8")
        except Exception as e:
            print('Error proper_encoding: {0}'.format(e))

    @staticmethod
    def stopwords(text: str) -> str:
        try:
            nlp = English()
            doc = nlp(text)
            token_list = [token.text for token in doc]
            sentence = []
            for word in token_list:
                lexeme = nlp.vocab[word]
                if not lexeme.is_stop:
                    sentence.append(word)
            return ' '.join(sentence)
        except Exception as e:
            print('Error stopwords: {0}'.format(e))

    @staticmethod
    def remove_patterns(text: str) -> str:
        try:
            text = re.sub(r'\©|\×|\⇔|\_|\»|\«|\~|\#|\$|\€|\Â|\�|\¬', '', text)
            text = re.sub(r'\,|\;|\:|\!|\¡|\’|\‘|\”|\“|\"|\'|\`', '', text)
            text = re.sub(r'\}|\{|\[|\]|\(|\)|\<|\>|\?|\¿|\°|\|', '', text)
            text = re.sub(r'\/|\-|\+|\*|\=|\^|\%|\&|\$', '', text)
            text = re.sub(r'\b\d+(?:\.\d+)?\s+', '', text)
            return text.lower()
        except Exception as e:
            print('Error remove_patterns: {0}'.format(e))

    @staticmethod
    def expand_contractions(text: str) -> str:
        """Expande las contracciones en el texto."""
        try:
            return contractions.fix(text)
        except Exception as e:
            print('Error expand_contractions: {0}'.format(e))

    @staticmethod
    def transformer(text: str, stopwords: bool = False) -> str:
        try:
            text_out = TextProcessing.proper_encoding(text)
            text_out = TextProcessing.expand_contractions(text_out)  # Expandimos las contracciones
            text_out = text_out.lower()
            text_out = re.sub("[\U0001f000-\U000e007f]", '[EMOJI]', text_out)
            text_out = re.sub(
                r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+'
                r'|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))',
                '[URL]', text_out)
            text_out = re.sub("@", '[MENTION]', text_out)
            text_out = re.sub("#([A-Za-z0-9_]{1,40})", '[HASTAG]', text_out)
            text_out = TextProcessing.remove_patterns(text_out)
            text_out = TextProcessing.stopwords(text_out) if stopwords else text_out
            text_out = re.sub(r'\s+', ' ', text_out).strip()
            text_out = text_out.rstrip()
            return text_out if text_out != ' ' else None
        except Exception as e:
            print('Error transformer: {0}'.format(e))

# Función para aplicar el preprocesamiento al DataFrame
def apply_preprocessing_to_df1(df1: pd.DataFrame, column: str) -> pd.DataFrame:
    tp = TextProcessing(lang='en')  # Inicializamos el preprocesador en inglés
    df1[f'{column}_processed1'] = df1[column].apply(lambda x: tp.transformer(x))
    return df1
df1 = apply_preprocessing_to_df1(df1, "text")
print(df1.head()) 


                   id                                               text  \
0  eng_dev_track2_001  I have a floor shift in the morning, hopefully...   
1  eng_dev_track2_002  What is it about this winter that is making me...   
2  eng_dev_track2_003       Longest, most awkward drive I've ever taken.   
3  eng_dev_track2_004                    I know not why, I wipe my face.   
4  eng_dev_track2_005  And I laughed like this: garhahagar, because m...   

                                     text_processed1  
0  i have a floor shift in the morning hopefully ...  
1  what is it about this winter that is making me...  
2      longest most awkward drive i have ever taken.  
3                     i know not why i wipe my face.  
4  and i laughed like this garhahagar because my ...  


In [12]:
# Predecir VAD usando el modelo entrenado
def predict_vad(df1, text_processed1):
    model.eval()  # Cambiar el modelo a modo evaluación
    input_ids, attention_masks = BERT_tokenization(df1, text_processed1)  # Tokenización de los textos

    dataset = TensorDataset(input_ids, attention_masks)
    dataloader = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=batch_size)

    predictions = []

    for batch in tqdm(dataloader, desc="Predicting"):
        b_input_ids, b_attention_mask = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            outputs = model(b_input_ids, attention_mask=b_attention_mask)
            logits = outputs.logits
            predictions.extend(logits.detach().cpu().numpy())

    return np.array(predictions)

# Obtener predicciones
predictions = predict_vad(df1, "text")

# Crear un DataFrame con las predicciones de VAD
pred_df1 = df1.copy()
pred_df1[["V_pred", "A_pred", "D_pred"]] = predictions

# Decodificar las predicciones de VAD a intensidades emocionales
emotion_labels = list(vad_values2.keys())
vad_matrix = np.array([[vad["V"], vad["A"], vad["D"]] for vad in vad_values2.values()])

def decode_vad_to_intensities(v, a, d):
    input_vad = np.array([v, a, d])
    distances = np.linalg.norm(vad_matrix - input_vad, axis=1)

    # Invertir las distancias para calcular intensidades proporcionales
    max_distance = np.max(distances)
    inverted_distances = max_distance - distances
    intensities = (inverted_distances / inverted_distances.sum()) * 3  # Escalar a 0-3
    intensities = np.round(intensities).astype(int)  # Redondear a enteros
    return pd.Series(intensities, index=emotion_labels)

# Aplicar la decodificación a las predicciones
decoded_intensities = pred_df1.apply(
    lambda row: decode_vad_to_intensities(row["V_pred"], row["A_pred"], row["D_pred"]),
    axis=1
)

# Añadir las intensidades decodificadas al DataFrame
for emotion in emotion_labels:
    pred_df1[emotion] = decoded_intensities[emotion]

# Guardar el DataFrame con las predicciones y las intensidades decodificadas
pred_df1.to_csv("predicciones_decodificadas.csv", index=False)
print("Archivo 'predicciones_decodificadas.csv' con intensidades decodificadas generado exitosamente.")

# Visualizar las primeras filas del DataFrame resultante
pred_df1

Predicting: 100%|██████████| 15/15 [00:13<00:00,  1.10it/s]

Archivo 'predicciones_decodificadas.csv' con intensidades decodificadas generado exitosamente.





Unnamed: 0,id,text,text_processed1,V_pred,A_pred,D_pred,Anger,Fear,Joy,Sadness,Surprise
0,eng_dev_track2_001,"I have a floor shift in the morning, hopefully...",i have a floor shift in the morning hopefully ...,1.090966,0.930460,-0.251571,0,1,0,1,1
1,eng_dev_track2_002,What is it about this winter that is making me...,what is it about this winter that is making me...,0.451516,0.554396,-1.276996,0,1,0,1,0
2,eng_dev_track2_003,"Longest, most awkward drive I've ever taken.",longest most awkward drive i have ever taken.,0.565344,0.229077,-1.043907,0,1,0,1,0
3,eng_dev_track2_004,"I know not why, I wipe my face.",i know not why i wipe my face.,0.193684,0.084650,-0.915383,0,1,0,1,0
4,eng_dev_track2_005,"And I laughed like this: garhahagar, because m...",and i laughed like this garhahagar because my ...,1.556907,1.718913,0.196447,0,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...
111,eng_dev_track2_112,My heart sank.,my heart sank.,0.292723,-0.482987,-1.416687,0,1,0,2,0
112,eng_dev_track2_113,I remember the sweat burning in my eyes and tr...,i remember the sweat burning in my eyes and tr...,0.845790,0.871266,-1.125727,0,1,0,1,1
113,eng_dev_track2_114,My sister was walking backwards and bumped her...,my sister was walking backwards and bumped her...,0.638080,1.323399,-0.492238,0,1,0,1,1
114,eng_dev_track2_115,"I can't breathe right, my head has been stuffe...",i cannot breathe right my head has been stuffe...,0.466825,0.062405,-1.392075,0,1,0,1,0


In [13]:
# Lista de columnas a excluir
columns_to_exclude = ['V_pred', 'A_pred', 'D_pred','text']  # Reemplaza con los nombres de las columnas que deseas excluir

# Crear un nuevo DataFrame sin las columnas especificadas
pred_df_filtered = pred_df1.drop(columns=columns_to_exclude)

# Verificar el resultado
print(pred_df_filtered.head())

                   id                                    text_processed1  \
0  eng_dev_track2_001  i have a floor shift in the morning hopefully ...   
1  eng_dev_track2_002  what is it about this winter that is making me...   
2  eng_dev_track2_003      longest most awkward drive i have ever taken.   
3  eng_dev_track2_004                     i know not why i wipe my face.   
4  eng_dev_track2_005  and i laughed like this garhahagar because my ...   

   Anger  Fear  Joy  Sadness  Surprise  
0      0     1    0        1         1  
1      0     1    0        1         0  
2      0     1    0        1         0  
3      0     1    0        1         0  
4      0     1    1        0         1  


In [14]:
# Reorganizar las columnas
columns_order = ['id','Anger', 'Fear', 'Joy', 'Sadness', 'Surprise']
pred_df_filtered = pred_df_filtered[columns_order]

# Guardar el DataFrame reorganizado en un nuevo archivo CSV
pred_df_filtered.to_csv("pred_eng_b.csv", index=False)

# Confirmar que el archivo ha sido guardado
print("El archivo 'pred_eng.csv' se ha guardado con las columnas reorganizadas.")


El archivo 'pred_eng.csv' se ha guardado con las columnas reorganizadas.


In [15]:
#test data
import pandas as pd

# Cargar el archivo CSV
df3 = pd.read_csv("eng.csv")

# Verificar la estructura del archivo
df3

Unnamed: 0,id,text,anger,fear,joy,sadness,surprise
0,eng_test_track_b_00001,/ o \ So today I went in for a new exam with D...,,,,,
1,eng_test_track_b_00002,The image I have in my mind is this: a group o...,,,,,
2,eng_test_track_b_00003,"I slammed my fist against the door and yelled,...",,,,,
3,eng_test_track_b_00004,I could not unbend my knees.,,,,,
4,eng_test_track_b_00005,"I spent the night at the hotel, mostly hanging...",,,,,
...,...,...,...,...,...,...,...
2762,eng_test_track_b_02763,Better late then never!,,,,,
2763,eng_test_track_b_02764,"In the last three weeks, I have started lookin...",,,,,
2764,eng_test_track_b_02765,"But I never fell out, so it wasn't a problem.",,,,,
2765,eng_test_track_b_02766,""" So I will remain positive for as long as I l...",,,,,


In [16]:

# Clase TextProcessing ya definida anteriormente
class TextProcessing(object):
    name = 'Text Processing'
    lang = 'en'

    def __init__(self, lang: str = 'en'):
        self.lang = lang

    @staticmethod
    def nlp(text: str) -> list:
        try:
            list_tagger = []
            tp_nlp = TextProcessing.load_spacy(TextProcessing.lang)
            doc = tp_nlp(text.lower())
            for token in doc:
                item = {'text': token.text, 'lemma': token.lemma_, 'pos': token.pos_, 'tag': token.tag_,
                        'dep': token.dep_, 'shape': token.shape_, 'is_alpha': token.is_alpha,
                        'is_stop': token.is_stop, 'is_digit': token.is_digit, 'is_punct': token.is_punct,
                        'syllables': token._.syllables}
                list_tagger.append(item)
            return list_tagger
        except Exception as e:
            print('Error nlp: {0}'.format(e))

    @staticmethod
    def load_spacy(lang: str) -> object:
        try:
            spacy_model = {'es': 'es_core_news_sm', 'en': 'en_core_web_sm'}
            if not spacy.util.is_package(spacy_model[lang]):
                spacy.cli.download(spacy_model[lang])

            component = spacy.load(spacy_model[lang])
            SpacySyllables(component)
            component.add_pipe('syllables', last=True)
            return component
        except Exception as e:
            print('Error load spacy: {0}'.format(e))

    @staticmethod
    def proper_encoding(text: str) -> str:
        try:
            text = unicodedata.normalize('NFD', text)
            text = text.encode('ascii', 'ignore')
            return text.decode("utf-8")
        except Exception as e:
            print('Error proper_encoding: {0}'.format(e))

    @staticmethod
    def stopwords(text: str) -> str:
        try:
            nlp = English()
            doc = nlp(text)
            token_list = [token.text for token in doc]
            sentence = []
            for word in token_list:
                lexeme = nlp.vocab[word]
                if not lexeme.is_stop:
                    sentence.append(word)
            return ' '.join(sentence)
        except Exception as e:
            print('Error stopwords: {0}'.format(e))

    @staticmethod
    def remove_patterns(text: str) -> str:
        try:
            text = re.sub(r'\©|\×|\⇔|\_|\»|\«|\~|\#|\$|\€|\Â|\�|\¬', '', text)
            text = re.sub(r'\,|\;|\:|\!|\¡|\’|\‘|\”|\“|\"|\'|\`', '', text)
            text = re.sub(r'\}|\{|\[|\]|\(|\)|\<|\>|\?|\¿|\°|\|', '', text)
            text = re.sub(r'\/|\-|\+|\*|\=|\^|\%|\&|\$', '', text)
            text = re.sub(r'\b\d+(?:\.\d+)?\s+', '', text)
            return text.lower()
        except Exception as e:
            print('Error remove_patterns: {0}'.format(e))

    @staticmethod
    def expand_contractions(text: str) -> str:
        """Expande las contracciones en el texto."""
        try:
            return contractions.fix(text)
        except Exception as e:
            print('Error expand_contractions: {0}'.format(e))

    @staticmethod
    def transformer(text: str, stopwords: bool = False) -> str:
        try:
            text_out = TextProcessing.proper_encoding(text)
            text_out = TextProcessing.expand_contractions(text_out)  # Expandimos las contracciones
            text_out = text_out.lower()
            text_out = re.sub("[\U0001f000-\U000e007f]", '[EMOJI]', text_out)
            text_out = re.sub(
                r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+'
                r'|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))',
                '[URL]', text_out)
            text_out = re.sub("@", '[MENTION]', text_out)
            text_out = re.sub("#([A-Za-z0-9_]{1,40})", '[HASTAG]', text_out)
            text_out = TextProcessing.remove_patterns(text_out)
            text_out = TextProcessing.stopwords(text_out) if stopwords else text_out
            text_out = re.sub(r'\s+', ' ', text_out).strip()
            text_out = text_out.rstrip()
            return text_out if text_out != ' ' else None
        except Exception as e:
            print('Error transformer: {0}'.format(e))

# Función para aplicar el preprocesamiento al DataFrame
def apply_preprocessing_to_df3(df3: pd.DataFrame, column: str) -> pd.DataFrame:
    tp = TextProcessing(lang='en')  # Inicializamos el preprocesador en inglés
    df3[f'{column}_processed3'] = df3[column].apply(lambda x: tp.transformer(x))
    return df3
df3 = apply_preprocessing_to_df3(df3, "text")
print(df3.head()) 

                       id                                               text  \
0  eng_test_track_b_00001  / o \ So today I went in for a new exam with D...   
1  eng_test_track_b_00002  The image I have in my mind is this: a group o...   
2  eng_test_track_b_00003  I slammed my fist against the door and yelled,...   
3  eng_test_track_b_00004                       I could not unbend my knees.   
4  eng_test_track_b_00005  I spent the night at the hotel, mostly hanging...   

   anger  fear  joy  sadness  surprise  \
0    NaN   NaN  NaN      NaN       NaN   
1    NaN   NaN  NaN      NaN       NaN   
2    NaN   NaN  NaN      NaN       NaN   
3    NaN   NaN  NaN      NaN       NaN   
4    NaN   NaN  NaN      NaN       NaN   

                                     text_processed3  
0  o \ so today i went in for a new exam with dr....  
1  the image i have in my mind is this a group of...  
2  i slammed my fist against the door and yelled ...  
3                       i could not unbend my 

In [17]:
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm


# Predecir VAD usando el modelo entrenado
def predict_vad(df3,text_processed3):
    model.eval()  # Cambiar el modelo a modo evaluación
    input_ids, attention_masks = BERT_tokenization(df3, text_processed3)  # Tokenización de los textos

    dataset = TensorDataset(input_ids, attention_masks)
    dataloader = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=batch_size)

    predictions = []

    for batch in tqdm(dataloader, desc="Predicting"):
        b_input_ids, b_attention_mask = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            outputs = model(b_input_ids, attention_mask=b_attention_mask)
            logits = outputs.logits
            predictions.extend(logits.detach().cpu().numpy())

    return np.array(predictions)

# Obtener predicciones
predictions = predict_vad(df3, "text_processed3")

# Crear un DataFrame con las predicciones de VAD
pred_df3 = df3.copy()
pred_df3[["V_pred", "A_pred", "D_pred"]] = predictions

# Decodificar las predicciones de VAD a intensidades emocionales
emotion_labels = list(vad_values2.keys())
vad_matrix = np.array([[vad["V"], vad["A"], vad["D"]] for vad in vad_values2.values()])

def decode_vad_to_intensities(v, a, d):
    input_vad = np.array([v, a, d])
    distances = np.linalg.norm(vad_matrix - input_vad, axis=1)

    # Invertir las distancias para calcular intensidades proporcionales
    max_distance = np.max(distances)
    inverted_distances = max_distance - distances
    intensities = (inverted_distances / inverted_distances.sum()) * 3  # Escalar a 0-3
    intensities = np.round(intensities).astype(int)  # Redondear a enteros
    return pd.Series(intensities, index=emotion_labels)

# Aplicar la decodificación a las predicciones
decoded_intensities = pred_df3.apply(
    lambda row: decode_vad_to_intensities(row["V_pred"], row["A_pred"], row["D_pred"]),
    axis=1
)

# Añadir las intensidades decodificadas al DataFrame
for emotion in emotion_labels:
    pred_df3[emotion] = decoded_intensities[emotion]

# Guardar el DataFrame con las predicciones y las intensidades decodificadas
pred_df3.to_csv("predicciones_decodificadas.csv", index=False)
print("Archivo 'predicciones_decodificadas.csv' con intensidades decodificadas generado exitosamente.")

# Visualizar las primeras filas del DataFrame resultante
pred_df3

Predicting: 100%|██████████| 346/346 [05:23<00:00,  1.07it/s]


Archivo 'predicciones_decodificadas.csv' con intensidades decodificadas generado exitosamente.


Unnamed: 0,id,text,anger,fear,joy,sadness,surprise,text_processed3,V_pred,A_pred,D_pred,Anger,Fear,Joy,Sadness,Surprise
0,eng_test_track_b_00001,/ o \ So today I went in for a new exam with D...,,,,,,o \ so today i went in for a new exam with dr....,0.311887,0.511399,-0.875982,0,1,0,1,0
1,eng_test_track_b_00002,The image I have in my mind is this: a group o...,,,,,,the image i have in my mind is this a group of...,0.611419,0.864274,-1.139573,0,1,0,1,1
2,eng_test_track_b_00003,"I slammed my fist against the door and yelled,...",,,,,,i slammed my fist against the door and yelled ...,0.035407,1.324540,0.075120,1,1,0,0,1
3,eng_test_track_b_00004,I could not unbend my knees.,,,,,,i could not unbend my knees.,0.639134,0.798000,-0.949776,0,1,0,1,1
4,eng_test_track_b_00005,"I spent the night at the hotel, mostly hanging...",,,,,,i spent the night at the hotel mostly hanging ...,0.720385,0.968216,-0.572761,0,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2762,eng_test_track_b_02763,Better late then never!,,,,,,better late then never,0.384021,0.913803,-0.249709,1,1,0,1,1
2763,eng_test_track_b_02764,"In the last three weeks, I have started lookin...",,,,,,in the last three weeks i have started looking...,2.721167,1.749632,0.911263,0,1,1,0,1
2764,eng_test_track_b_02765,"But I never fell out, so it wasn't a problem.",,,,,,but i never fell out so it was not a problem.,1.394246,1.041869,0.670498,0,1,1,0,1
2765,eng_test_track_b_02766,""" So I will remain positive for as long as I l...",,,,,,so i will remain positive for as long as i live.,1.366620,0.746311,-0.124205,0,1,1,1,1


In [18]:
import pandas as pd

# Cargar el archivo CSV con opciones para manejar comillas y delimitadores
df4 = pd.read_csv(
    "eng2_train.csv",
    delimiter=",",  # Asegura que el delimitador sea la coma
    quotechar='"',  # Manejo adecuado de comillas dobles
    encoding="utf-8",  # Usa UTF-8 para evitar problemas de caracteres especiales
    skip_blank_lines=True,  # Ignora líneas en blanco
    on_bad_lines="skip"  # Evita errores por líneas mal formateadas
)

# Verificar si hay valores NaN
df4.dropna(inplace=True)  # Elimina filas con valores NaN generados por errores de lectura

# Mostrar las primeras filas para verificar la estructura
print(df4.head())

                      id                                               text  \
0   eng_train_track2_001       None of us has mentioned the incident since.   
1   eng_train_track2_015                       So... for reasons unknown...   
8   eng_train_track2_022  Later when we got home she saw all her present...   
9   eng_train_track2_023                              Was I drunk or a kid?   
10  eng_train_track2_024  I farted and a little nugget fell out onto the...   

    Joy  Fear  Anger  Sadness  Surprise  
0   0.0   1.0    0.0      2.0       1.0  
1   0.0   1.0    0.0      0.0       2.0  
8   2.0   0.0    0.0      0.0       1.0  
9   0.0   2.0    0.0      0.0       2.0  
10  1.0   0.0    0.0      0.0       2.0  


In [19]:
import spacy
import re
import nltk
import unicodedata
import requests
from spacy_syllables import SpacySyllables
from bs4 import BeautifulSoup
from nltk import TweetTokenizer
from spacy.lang.es import Spanish
from spacy.lang.en import English
from nltk.util import ngrams
import pandas as pd
import contractions  # Importamos la librería para expandir contracciones
import unicodedata


# Clase TextProcessing ya definida anteriormente
class TextProcessing(object):
    name = 'Text Processing'
    lang = 'en'

    def __init__(self, lang: str = 'en'):
        self.lang = lang

    @staticmethod
    def nlp(text: str) -> list:
        try:
            list_tagger = []
            tp_nlp = TextProcessing.load_spacy(TextProcessing.lang)
            doc = tp_nlp(text.lower())
            for token in doc:
                item = {'text': token.text, 'lemma': token.lemma_, 'pos': token.pos_, 'tag': token.tag_,
                        'dep': token.dep_, 'shape': token.shape_, 'is_alpha': token.is_alpha,
                        'is_stop': token.is_stop, 'is_digit': token.is_digit, 'is_punct': token.is_punct,
                        'syllables': token._.syllables}
                list_tagger.append(item)
            return list_tagger
        except Exception as e:
            print('Error nlp: {0}'.format(e))

    @staticmethod
    def load_spacy(lang: str) -> object:
        try:
            spacy_model = {'es': 'es_core_news_sm', 'en': 'en_core_web_sm'}
            if not spacy.util.is_package(spacy_model[lang]):
                spacy.cli.download(spacy_model[lang])

            component = spacy.load(spacy_model[lang])
            SpacySyllables(component)
            component.add_pipe('syllables', last=True)
            return component
        except Exception as e:
            print('Error load spacy: {0}'.format(e))

    @staticmethod
    def proper_encoding(text: str) -> str:
        try:
            text = unicodedata.normalize('NFD', text)
            text = text.encode('ascii', 'ignore')
            return text.decode("utf-8")
        except Exception as e:
            print('Error proper_encoding: {0}'.format(e))

    @staticmethod
    def stopwords(text: str) -> str:
        try:
            nlp = English()
            doc = nlp(text)
            token_list = [token.text for token in doc]
            sentence = []
            for word in token_list:
                lexeme = nlp.vocab[word]
                if not lexeme.is_stop:
                    sentence.append(word)
            return ' '.join(sentence)
        except Exception as e:
            print('Error stopwords: {0}'.format(e))

    @staticmethod
    def remove_patterns(text: str) -> str:
        try:
            text = re.sub(r'\©|\×|\⇔|\_|\»|\«|\~|\#|\$|\€|\Â|\�|\¬', '', text)
            text = re.sub(r'\,|\;|\:|\!|\¡|\’|\‘|\”|\“|\"|\'|\`', '', text)
            text = re.sub(r'\}|\{|\[|\]|\(|\)|\<|\>|\?|\¿|\°|\|', '', text)
            text = re.sub(r'\/|\-|\+|\*|\=|\^|\%|\&|\$', '', text)
            text = re.sub(r'\b\d+(?:\.\d+)?\s+', '', text)
            return text.lower()
        except Exception as e:
            print('Error remove_patterns: {0}'.format(e))

    @staticmethod
    def expand_contractions(text: str) -> str:
        """Expande las contracciones en el texto."""
        try:
            return contractions.fix(text)
        except Exception as e:
            print('Error expand_contractions: {0}'.format(e))

    @staticmethod
    def transformer(text: str, stopwords: bool = False) -> str:
        try:
            text_out = TextProcessing.proper_encoding(text)
            text_out = TextProcessing.expand_contractions(text_out)  # Expandimos las contracciones
            text_out = text_out.lower()
            text_out = re.sub("[\U0001f000-\U000e007f]", '[EMOJI]', text_out)
            text_out = re.sub(
                r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+'
                r'|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))',
                '[URL]', text_out)
            text_out = re.sub("@", '[MENTION]', text_out)
            text_out = re.sub("#([A-Za-z0-9_]{1,40})", '[HASTAG]', text_out)
            text_out = TextProcessing.remove_patterns(text_out)
            text_out = TextProcessing.stopwords(text_out) if stopwords else text_out
            text_out = re.sub(r'\s+', ' ', text_out).strip()
            text_out = text_out.rstrip()
            return text_out if text_out != ' ' else None
        except Exception as e:
            print('Error transformer: {0}'.format(e))


# Función para aplicar el preprocesamiento al DataFrame
def apply_preprocessing_to_df(df4: pd.DataFrame, column: str) -> pd.DataFrame:
    tp = TextProcessing(lang='en')  # Inicializamos el preprocesador en inglés
    df4[f'{column}_processed'] = df4[column].apply(lambda x: tp.transformer(x))
    return df4
df4 = apply_preprocessing_to_df(df4, "text")
print(df4.head()) 

                      id                                               text  \
0   eng_train_track2_001       None of us has mentioned the incident since.   
1   eng_train_track2_015                       So... for reasons unknown...   
8   eng_train_track2_022  Later when we got home she saw all her present...   
9   eng_train_track2_023                              Was I drunk or a kid?   
10  eng_train_track2_024  I farted and a little nugget fell out onto the...   

    Joy  Fear  Anger  Sadness  Surprise  \
0   0.0   1.0    0.0      2.0       1.0   
1   0.0   1.0    0.0      0.0       2.0   
8   2.0   0.0    0.0      0.0       1.0   
9   0.0   2.0    0.0      0.0       2.0   
10  1.0   0.0    0.0      0.0       2.0   

                                       text_processed  
0        none of us has mentioned the incident since.  
1                        so... for reasons unknown...  
8   later when we got home she saw all her present...  
9                                was i dru

In [20]:
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm


# Predecir VAD usando el modelo entrenado
def predict_vad(df4,text_processed):
    model.eval()  # Cambiar el modelo a modo evaluación
    input_ids, attention_masks = BERT_tokenization(df4, text_processed)  # Tokenización de los textos

    dataset = TensorDataset(input_ids, attention_masks)
    dataloader = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=batch_size)

    predictions = []

    for batch in tqdm(dataloader, desc="Predicting"):
        b_input_ids, b_attention_mask = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            outputs = model(b_input_ids, attention_mask=b_attention_mask)
            logits = outputs.logits
            predictions.extend(logits.detach().cpu().numpy())

    return np.array(predictions)
# Obtener predicciones
predictions = predict_vad(df4, "text_processed")

# Crear un DataFrame con las predicciones de VAD
pred_df4 = df4.copy()
pred_df4[["V_pred", "A_pred", "D_pred"]] = predictions

# Decodificar las predicciones de VAD a intensidades emocionales
emotion_labels = list(vad_values2.keys())
vad_matrix = np.array([[vad["V"], vad["A"], vad["D"]] for vad in vad_values2.values()])

def decode_vad_to_intensities(v, a, d):
    input_vad = np.array([v, a, d])
    distances = np.linalg.norm(vad_matrix - input_vad, axis=1)

    # Invertir las distancias para calcular intensidades proporcionales
    max_distance = np.max(distances)
    inverted_distances = max_distance - distances
    intensities = (inverted_distances / inverted_distances.sum()) * 3  # Escalar a 0-3
    intensities = np.round(intensities).astype(int)  # Redondear a enteros
    return pd.Series(intensities, index=emotion_labels)

# Aplicar la decodificación a las predicciones
decoded_intensities = pred_df4.apply(
    lambda row: decode_vad_to_intensities(row["V_pred"], row["A_pred"], row["D_pred"]),
    axis=1
)

# Añadir las intensidades decodificadas al DataFrame
for emotion in emotion_labels:
    pred_df4[emotion] = decoded_intensities[emotion]

# Guardar el DataFrame con las predicciones y las intensidades decodificadas
pred_df4.to_csv("predicciones_decodificadas.csv", index=False)
print("Archivo 'predicciones_decodificadas.csv' con intensidades decodificadas generado exitosamente.")

# Visualizar las primeras filas del DataFrame resultante
pred_df4

Predicting: 100%|██████████| 1/1 [00:00<00:00,  1.06it/s]

Archivo 'predicciones_decodificadas.csv' con intensidades decodificadas generado exitosamente.





Unnamed: 0,id,text,Joy,Fear,Anger,Sadness,Surprise,text_processed,V_pred,A_pred,D_pred
0,eng_train_track2_001,None of us has mentioned the incident since.,0,1,0,2,0,none of us has mentioned the incident since.,0.399733,-0.068715,-0.896017
1,eng_train_track2_015,So... for reasons unknown...,0,2,0,0,1,so... for reasons unknown...,0.876226,1.36863,-0.435247
8,eng_train_track2_022,Later when we got home she saw all her present...,1,1,0,0,1,later when we got home she saw all her present...,2.949194,1.530099,1.310039
9,eng_train_track2_023,Was I drunk or a kid?,1,1,0,0,1,was i drunk or a kid,1.122741,1.695458,-0.012874
10,eng_train_track2_024,I farted and a little nugget fell out onto the...,1,1,0,0,1,i farted and a little nugget fell out onto the...,2.81253,1.659924,1.206395
12,eng_train_track2_026,Never found anything... kind of spooky.,1,1,0,0,1,never found anything... kind of spooky.,1.487133,1.454574,-0.584707
14,eng_train_track2_028,We cuddled on the couch whilst watching TV.,1,1,0,0,1,we cuddled on the couch whilst watching tv.,2.639673,1.159558,1.436588
16,eng_train_track2_2768,I stopped a couple times to stretch out my cal...,1,1,0,0,1,i stopped a couple times to stretch out my cal...,1.895036,1.227654,0.12675
