In [1]:
import pandas as pd

# Cargar el archivo CSV
df = pd.read_csv("eng_train.csv")

# Verificar la estructura del archivo
df

Unnamed: 0,id,text,Joy,Fear,Anger,Sadness,Surprise
0,eng_train_track2_001,None of us has mentioned the incident since.,0,1,0,2,1
1,eng_train_track2_002,"I was 7 and woke up early, so I went to the ba...",1,0,0,0,0
2,eng_train_track2_003,By that point I felt like someone was stabbing...,0,3,0,0,0
3,eng_train_track2_004,watching her leave with dudes drove me crazy.,0,1,3,1,0
4,eng_train_track2_005,`` My eyes widened.,0,1,0,0,2
...,...,...,...,...,...,...,...
2763,eng_train_track2_2764,"My face is cold, and my hands are guilty.",0,1,0,1,0
2764,eng_train_track2_2765,I remembered how I dragged his box into the be...,1,0,0,0,0
2765,eng_train_track2_2766,As I walked in the door she came around the co...,3,0,0,0,1
2766,eng_train_track2_2767,They kept me at the hospital for 24 hours-and ...,0,1,0,1,0


In [2]:
import spacy
import re
import nltk
import unicodedata
import requests
from spacy_syllables import SpacySyllables
from bs4 import BeautifulSoup
from nltk import TweetTokenizer
from spacy.lang.es import Spanish
from spacy.lang.en import English
from nltk.util import ngrams
import pandas as pd
import contractions  # Importamos la librería para expandir contracciones
import unicodedata



#df = pd.DataFrame(df_final)

# Clase TextProcessing ya definida anteriormente
class TextProcessing(object):
    name = 'Text Processing'
    lang = 'en'

    def __init__(self, lang: str = 'en'):
        self.lang = lang

    @staticmethod
    def nlp(text: str) -> list:
        try:
            list_tagger = []
            tp_nlp = TextProcessing.load_spacy(TextProcessing.lang)
            doc = tp_nlp(text.lower())
            for token in doc:
                item = {'text': token.text, 'lemma': token.lemma_, 'pos': token.pos_, 'tag': token.tag_,
                        'dep': token.dep_, 'shape': token.shape_, 'is_alpha': token.is_alpha,
                        'is_stop': token.is_stop, 'is_digit': token.is_digit, 'is_punct': token.is_punct,
                        'syllables': token._.syllables}
                list_tagger.append(item)
            return list_tagger
        except Exception as e:
            print('Error nlp: {0}'.format(e))

    @staticmethod
    def load_spacy(lang: str) -> object:
        try:
            spacy_model = {'es': 'es_core_news_sm', 'en': 'en_core_web_sm'}
            if not spacy.util.is_package(spacy_model[lang]):
                spacy.cli.download(spacy_model[lang])

            component = spacy.load(spacy_model[lang])
            SpacySyllables(component)
            component.add_pipe('syllables', last=True)
            return component
        except Exception as e:
            print('Error load spacy: {0}'.format(e))

    @staticmethod
    def proper_encoding(text: str) -> str:
        try:
            text = unicodedata.normalize('NFD', text)
            text = text.encode('ascii', 'ignore')
            return text.decode("utf-8")
        except Exception as e:
            print('Error proper_encoding: {0}'.format(e))

    @staticmethod
    def stopwords(text: str) -> str:
        try:
            nlp = English()
            doc = nlp(text)
            token_list = [token.text for token in doc]
            sentence = []
            for word in token_list:
                lexeme = nlp.vocab[word]
                if not lexeme.is_stop:
                    sentence.append(word)
            return ' '.join(sentence)
        except Exception as e:
            print('Error stopwords: {0}'.format(e))

    @staticmethod
    def remove_patterns(text: str) -> str:
        try:
            text = re.sub(r'\©|\×|\⇔|\_|\»|\«|\~|\#|\$|\€|\Â|\�|\¬', '', text)
            text = re.sub(r'\,|\;|\:|\!|\¡|\’|\‘|\”|\“|\"|\'|\`', '', text)
            text = re.sub(r'\}|\{|\[|\]|\(|\)|\<|\>|\?|\¿|\°|\|', '', text)
            text = re.sub(r'\/|\-|\+|\*|\=|\^|\%|\&|\$', '', text)
            text = re.sub(r'\b\d+(?:\.\d+)?\s+', '', text)
            return text.lower()
        except Exception as e:
            print('Error remove_patterns: {0}'.format(e))

    @staticmethod
    def expand_contractions(text: str) -> str:
        """Expande las contracciones en el texto."""
        try:
            return contractions.fix(text)
        except Exception as e:
            print('Error expand_contractions: {0}'.format(e))

    @staticmethod
    def transformer(text: str, stopwords: bool = False) -> str:
        try:
            text_out = TextProcessing.proper_encoding(text)
            text_out = TextProcessing.expand_contractions(text_out)  # Expandimos las contracciones
            text_out = text_out.lower()
            text_out = re.sub("[\U0001f000-\U000e007f]", '[EMOJI]', text_out)
            text_out = re.sub(
                r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+'
                r'|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))',
                '[URL]', text_out)
            text_out = re.sub("@", '[MENTION]', text_out)
            text_out = re.sub("#([A-Za-z0-9_]{1,40})", '[HASTAG]', text_out)
            text_out = TextProcessing.remove_patterns(text_out)
            text_out = TextProcessing.stopwords(text_out) if stopwords else text_out
            text_out = re.sub(r'\s+', ' ', text_out).strip()
            text_out = text_out.rstrip()
            return text_out if text_out != ' ' else None
        except Exception as e:
            print('Error transformer: {0}'.format(e))


# Función para aplicar el preprocesamiento al DataFrame
def apply_preprocessing_to_df(df: pd.DataFrame, column: str) -> pd.DataFrame:
    tp = TextProcessing(lang='en')  # Inicializamos el preprocesador en inglés
    df[f'{column}_processed'] = df[column].apply(lambda x: tp.transformer(x))
    return df



In [3]:

vad_values2 = {
    "Anger": {"V": 0.167, "A": 0.865, "D": 0.657},
    "Fear": {"V": 0.73, "A": 0.840, "D": 0.293},
    "Joy": {"V": 0.980, "A": 0.824, "D": 0.794},
    "Sadness": {"V": 0.52, "A": 0.288, "D": 0.164},
    "Surprise": {"V": 0.875, "A": 0.875, "D": 0.562},
}


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split



# Función para calcular V, A y D
def calculate_vad(row):
    emotions = ["Anger", "Fear", "Joy", "Sadness", "Surprise"]
    total_intensity = sum(row[emotion] for emotion in emotions)
    if total_intensity == 0:
        return 0.500, 0.500, 0.500  # Valores para "No Emotion"

    V = sum(row[emotion] * vad_values2[emotion]["V"] for emotion in emotions) / total_intensity
    A = sum(row[emotion] * vad_values2[emotion]["A"] for emotion in emotions) / total_intensity
    D = sum(row[emotion] * vad_values2[emotion]["D"] for emotion in emotions) / total_intensity
    return round(V, 3), round(A, 3), round(D, 3)

# Aplicar la función para calcular V, A y D
df[["V", "A", "D"]] = df.apply(calculate_vad, axis=1, result_type="expand")

# Preprocesar texto
tp = TextProcessing(lang='en')
df["text_processed"] = df["text"].apply(lambda x: tp.transformer(x))

# Dividir los datos en entrenamiento y validación (80-20)
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Mostrar resultados
print(train_df.head())
print(val_df.head())


                         id  \
2124  eng_train_track2_2125   
2716  eng_train_track2_2717   
2232  eng_train_track2_2233   
261    eng_train_track2_262   
2059  eng_train_track2_2060   

                                                   text  Joy  Fear  Anger  \
2124  26 January 2011 @ 04:45 pm Boys & Girls 718 Cr...    0     2      0   
2716  We headed north on a sunny Saturday morning in...    1     0      0   
2232  I looked down to find five small white arrows ...    0     3      0   
261                         I've never gone back there.    0     2      0   
2059                                     My heart sank.    0     0      0   

      Sadness  Surprise      V      A      D  \
2124        0         0  0.730  0.840  0.293   
2716        0         0  0.980  0.824  0.794   
2232        3         2  0.688  0.642  0.312   
261         2         0  0.625  0.564  0.228   
2059        2         0  0.520  0.288  0.164   

                                         text_processed  
212

In [5]:
train_df

Unnamed: 0,id,text,Joy,Fear,Anger,Sadness,Surprise,V,A,D,text_processed
2124,eng_train_track2_2125,26 January 2011 @ 04:45 pm Boys & Girls 718 Cr...,0,2,0,0,0,0.730,0.840,0.293,january mention pm boys girls crawled out of b...
2716,eng_train_track2_2717,We headed north on a sunny Saturday morning in...,1,0,0,0,0,0.980,0.824,0.794,we headed north on a sunny saturday morning in...
2232,eng_train_track2_2233,I looked down to find five small white arrows ...,0,3,0,3,2,0.688,0.642,0.312,i looked down to find five small white arrows ...
261,eng_train_track2_262,I've never gone back there.,0,2,0,2,0,0.625,0.564,0.228,i have never gone back there.
2059,eng_train_track2_2060,My heart sank.,0,0,0,2,0,0.520,0.288,0.164,my heart sank.
...,...,...,...,...,...,...,...,...,...,...,...
1638,eng_train_track2_1639,She cants her hip against my waist into my sid...,2,0,0,0,1,0.945,0.841,0.717,she cants her hip against my waist into my sid...
1095,eng_train_track2_1096,"I then did the dishes, whitened my teeth, watc...",0,0,0,0,0,0.500,0.500,0.500,i then did the dishes whitened my teeth watche...
1130,eng_train_track2_1131,It just kind of gradually vanished over a coup...,0,0,0,0,1,0.875,0.875,0.562,it just kind of gradually vanished over a coup...
1294,eng_train_track2_1295,I didn't look out of my hands.,0,1,0,0,0,0.730,0.840,0.293,i did not look out of my hands.


In [6]:

import re
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_scheduler
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import unicodedata
from spacy_syllables import SpacySyllables
import spacy
from sklearn.model_selection import train_test_split
from torch.nn.functional import softmax
# Tokenización con BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

def BERT_tokenization(df, text_column):
    input_ids = []
    attention_masks = []

    for sent in df[text_column]:
        encoded_dict = tokenizer.encode_plus(
            sent,
            add_special_tokens=True,
            max_length=128,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0)

train_input_ids, train_attention_masks = BERT_tokenization(train_df, "text_processed")
val_input_ids, val_attention_masks = BERT_tokenization(val_df, "text_processed")

# Convertir etiquetas
train_labels = torch.tensor(train_df[["V", "A", "D"]].values, dtype=torch.float32)
val_labels = torch.tensor(val_df[["V", "A", "D"]].values, dtype=torch.float32)

# Crear datasets
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
val_dataset = TensorDataset(val_input_ids, val_attention_masks, val_labels)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [7]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm  # Importamos tqdm para las barras de progreso

batch_size = 8

train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
validation_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)

# Configurar el modelo BERT
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=3,  # Tres etiquetas: V, A, D
    output_attentions=False,
    output_hidden_states=False,
)
model.to(device)

# Configurar optimizador y scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
num_epochs = 3
num_training_steps = len(train_dataloader) * num_epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)


from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_metrics(predictions, labels):
    pred_flat = np.argmax(predictions, axis=1)
    labels_flat = np.argmax(labels, axis=1)

    accuracy = accuracy_score(labels_flat, pred_flat)
    precision_micro = precision_score(labels_flat, pred_flat, average="micro")
    precision_macro = precision_score(labels_flat, pred_flat, average="macro")
    recall_micro = recall_score(labels_flat, pred_flat, average="micro")
    recall_macro = recall_score(labels_flat, pred_flat, average="macro")
    f1_micro = f1_score(labels_flat, pred_flat, average="micro")
    f1_macro = f1_score(labels_flat, pred_flat, average="macro")

    return {
        "accuracy": accuracy,
        "precision_micro": precision_micro,
        "precision_macro": precision_macro,
        "recall_micro": recall_micro,
        "recall_macro": recall_macro,
        "f1_micro": f1_micro,
        "f1_macro": f1_macro,
    }

def train_and_evaluate():
    metrics_per_epoch = []

    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch + 1}/{num_epochs}")
        
        # Training
        model.train()
        total_loss = 0
        
        for batch in tqdm(train_dataloader, desc="Training"):
            b_input_ids, b_attention_mask, b_labels = tuple(t.to(device) for t in batch)
            model.zero_grad()

            outputs = model(b_input_ids, 
                            attention_mask=b_attention_mask, 
                            labels=b_labels)
            loss = outputs.loss
            logits = outputs.logits
            total_loss += loss.item()

            loss.backward()
            optimizer.step()
            lr_scheduler.step()

        avg_train_loss = total_loss / len(train_dataloader)
        print(f"Average training loss: {avg_train_loss:.4f}")

         # Evaluation
        model.eval()
        eval_predictions = []
        eval_labels = []

        for batch in tqdm(validation_dataloader, desc="Validation"):
            b_input_ids, b_attention_mask, b_labels = tuple(t.to(device) for t in batch)

            with torch.no_grad():
                outputs = model(b_input_ids, 
                                attention_mask=b_attention_mask)
                logits = outputs.logits

            eval_predictions.extend(logits.detach().cpu().numpy())
            eval_labels.extend(b_labels.detach().cpu().numpy())

        metrics = evaluate_metrics(np.array(eval_predictions), np.array(eval_labels))
        metrics["train_loss"] = avg_train_loss
        metrics_per_epoch.append(metrics)
        

        print(f"Accuracy: {metrics['accuracy']:.4f}")
        print(f"Precision (Micro): {metrics['precision_micro']:.4f}, Precision (Macro): {metrics['precision_macro']:.4f}")
        print(f"Recall (Micro): {metrics['recall_micro']:.4f}, Recall (Macro): {metrics['recall_macro']:.4f}")
        print(f"F1 (Micro): {metrics['f1_micro']:.4f}, F1 (Macro): {metrics['f1_macro']:.4f}")

    # Calculate average metrics over all epochs
    avg_metrics = {key: np.mean([epoch[key] for epoch in metrics_per_epoch]) for key in metrics_per_epoch[0].keys()}

    print("\nFinal Average Metrics Across Epochs:")
    print(f"Accuracy: {avg_metrics['accuracy']:.4f}")
    print(f"Precision (Micro): {avg_metrics['precision_micro']:.4f}, Precision (Macro): {avg_metrics['precision_macro']:.4f}")
    print(f"Recall (Micro): {avg_metrics['recall_micro']:.4f}, Recall (Macro): {avg_metrics['recall_macro']:.4f}")
    print(f"F1 (Micro): {avg_metrics['f1_micro']:.4f}, F1 (Macro): {avg_metrics['f1_macro']:.4f}")

    return metrics_per_epoch, avg_metrics

# Run the training and evaluation
metrics_per_epoch, avg_metrics = train_and_evaluate() 

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/3


Training: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 277/277 [14:15<00:00,  3.09s/it]


Average training loss: 0.6113


Validation: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 70/70 [01:05<00:00,  1.06it/s]


Accuracy: 0.7166
Precision (Micro): 0.7166, Precision (Macro): 0.7153
Recall (Micro): 0.7166, Recall (Macro): 0.7009
F1 (Micro): 0.7166, F1 (Macro): 0.7034

Epoch 2/3


Training: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 277/277 [14:13<00:00,  3.08s/it]


Average training loss: 0.5792


Validation: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 70/70 [01:05<00:00,  1.07it/s]


Accuracy: 0.7184
Precision (Micro): 0.7184, Precision (Macro): 0.7355
Recall (Micro): 0.7184, Recall (Macro): 0.7334
F1 (Micro): 0.7184, F1 (Macro): 0.7184

Epoch 3/3


Training: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 277/277 [14:14<00:00,  3.09s/it]


Average training loss: 0.5644


Validation: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 70/70 [01:05<00:00,  1.07it/s]

Accuracy: 0.7581
Precision (Micro): 0.7581, Precision (Macro): 0.7579
Recall (Micro): 0.7581, Recall (Macro): 0.7626
F1 (Micro): 0.7581, F1 (Macro): 0.7570

Final Average Metrics Across Epochs:
Accuracy: 0.7310
Precision (Micro): 0.7310, Precision (Macro): 0.7362
Recall (Micro): 0.7310, Recall (Macro): 0.7323
F1 (Micro): 0.7310, F1 (Macro): 0.7262





In [None]:
anger Pearson r: 0.4875
fear Pearson r: 0.1348
joy Pearson r: 0.4713
sadness Pearson r: 0.5394
surprise Pearson r: 0.197

Average Pearson r: 0.366

In [8]:
from transformers import BertForSequenceClassification, BertTokenizer

# Directorio donde se guardará el modelo
output_dir = "./modelo_entrenado_completo"

# Crear el directorio si no existe
import os
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Guardar el modelo
model.save_pretrained(output_dir)

# Guardar el tokenizador
tokenizer.save_pretrained(output_dir)

print(f"Modelo y tokenizador guardados en {output_dir}")


Modelo y tokenizador guardados en ./modelo_entrenado_completo


In [9]:
#DATOS DE VALIDACIÓN 

In [10]:
# Guardar el modelo entrenado
torch.save(model.state_dict(), 'modelo_entrenado.pth')
print("Modelo guardado exitosamente.")


Modelo guardado exitosamente.


In [11]:
def validate_model():
    model.eval()  # Cambiar el modelo al modo de evaluación
    eval_predictions = []
    eval_labels = []

    # Proceso de validación
    for batch in tqdm(validation_dataloader, desc="Validating"):
        b_input_ids, b_attention_mask, b_labels = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            outputs = model(b_input_ids, attention_mask=b_attention_mask)
            logits = outputs.logits

        eval_predictions.extend(logits.detach().cpu().numpy())
        eval_labels.extend(b_labels.detach().cpu().numpy())

    # Calcular métricas
    metrics = evaluate_metrics(np.array(eval_predictions), np.array(eval_labels))
    
    print("\nValidation Metrics:")
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"Precision (Micro): {metrics['precision_micro']:.4f}, Precision (Macro): {metrics['precision_macro']:.4f}")
    print(f"Recall (Micro): {metrics['recall_micro']:.4f}, Recall (Macro): {metrics['recall_macro']:.4f}")
    print(f"F1 (Micro): {metrics['f1_micro']:.4f}, F1 (Macro): {metrics['f1_macro']:.4f}")

    return metrics

# Ejecutar la validación
validation_metrics = validate_model()


Validating: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 70/70 [01:05<00:00,  1.07it/s]


Validation Metrics:
Accuracy: 0.7581
Precision (Micro): 0.7581, Precision (Macro): 0.7579
Recall (Micro): 0.7581, Recall (Macro): 0.7626
F1 (Micro): 0.7581, F1 (Macro): 0.7570





In [12]:
#DATOS PARA VALIDAR DEV

In [13]:
import pandas as pd

# Cargar el archivo CSV
df1 = pd.read_csv("eng_dev.csv")

# Verificar la estructura del archivo
df1

Unnamed: 0,id,text
0,eng_dev_track2_001,"I have a floor shift in the morning, hopefully..."
1,eng_dev_track2_002,What is it about this winter that is making me...
2,eng_dev_track2_003,"Longest, most awkward drive I've ever taken."
3,eng_dev_track2_004,"I know not why, I wipe my face."
4,eng_dev_track2_005,"And I laughed like this: garhahagar, because m..."
...,...,...
111,eng_dev_track2_112,My heart sank.
112,eng_dev_track2_113,I remember the sweat burning in my eyes and tr...
113,eng_dev_track2_114,My sister was walking backwards and bumped her...
114,eng_dev_track2_115,"I can't breathe right, my head has been stuffe..."


In [14]:
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm

# Cargar los datos del archivo CSV
df1 = pd.read_csv("eng_dev.csv")

# Predecir VAD usando el modelo entrenado
def predict_vad(df, text_column):
    model.eval()  # Cambiar el modelo a modo evaluación
    input_ids, attention_masks = BERT_tokenization(df, text_column)  # Tokenización de los textos

    dataset = TensorDataset(input_ids, attention_masks)
    dataloader = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=batch_size)

    predictions = []

    for batch in tqdm(dataloader, desc="Predicting"):
        b_input_ids, b_attention_mask = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            outputs = model(b_input_ids, attention_mask=b_attention_mask)
            logits = outputs.logits
            predictions.extend(logits.detach().cpu().numpy())

    return np.array(predictions)

# Obtener predicciones
predictions = predict_vad(df1, "text")

# Crear un DataFrame con las predicciones de VAD
pred_df = df1.copy()
pred_df[["V_pred", "A_pred", "D_pred"]] = predictions

# Decodificar las predicciones de VAD a intensidades emocionales
emotion_labels = list(vad_values2.keys())
vad_matrix = np.array([[vad["V"], vad["A"], vad["D"]] for vad in vad_values2.values()])

def decode_vad_to_intensities(v, a, d):
    input_vad = np.array([v, a, d])
    distances = np.linalg.norm(vad_matrix - input_vad, axis=1)

    # Invertir las distancias para calcular intensidades proporcionales
    max_distance = np.max(distances)
    inverted_distances = max_distance - distances
    intensities = (inverted_distances / inverted_distances.sum()) * 3  # Escalar a 0-3
    intensities = np.round(intensities).astype(int)  # Redondear a enteros
    return pd.Series(intensities, index=emotion_labels)

# Aplicar la decodificación a las predicciones
decoded_intensities = pred_df.apply(
    lambda row: decode_vad_to_intensities(row["V_pred"], row["A_pred"], row["D_pred"]),
    axis=1
)

# Añadir las intensidades decodificadas al DataFrame
for emotion in emotion_labels:
    pred_df[emotion] = decoded_intensities[emotion]

# Guardar el DataFrame con las predicciones y las intensidades decodificadas
pred_df.to_csv("predicciones_decodificadas.csv", index=False)
print("Archivo 'predicciones_decodificadas.csv' con intensidades decodificadas generado exitosamente.")

# Visualizar las primeras filas del DataFrame resultante
pred_df


Predicting: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:13<00:00,  1.10it/s]

Archivo 'predicciones_decodificadas.csv' con intensidades decodificadas generado exitosamente.





Unnamed: 0,id,text,V_pred,A_pred,D_pred,Anger,Fear,Joy,Sadness,Surprise
0,eng_dev_track2_001,"I have a floor shift in the morning, hopefully...",1.003434,0.891863,-0.302177,0,1,0,1,1
1,eng_dev_track2_002,What is it about this winter that is making me...,0.621793,0.976994,-0.928572,0,1,0,1,1
2,eng_dev_track2_003,"Longest, most awkward drive I've ever taken.",0.839219,0.660034,-0.784878,0,1,0,1,1
3,eng_dev_track2_004,"I know not why, I wipe my face.",0.421110,-0.095308,-1.237276,0,1,0,1,0
4,eng_dev_track2_005,"And I laughed like this: garhahagar, because m...",1.796416,1.809361,0.109786,0,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...
111,eng_dev_track2_112,My heart sank.,0.289251,-0.641791,-1.465758,0,1,0,2,0
112,eng_dev_track2_113,I remember the sweat burning in my eyes and tr...,0.788755,1.110890,-1.007916,0,1,0,1,1
113,eng_dev_track2_114,My sister was walking backwards and bumped her...,0.690792,1.230995,-0.613613,0,1,0,1,1
114,eng_dev_track2_115,"I can't breathe right, my head has been stuffe...",0.520161,-0.014261,-1.404953,0,1,0,1,0


In [15]:
# Lista de columnas a excluir
columns_to_exclude = ['V_pred', 'A_pred', 'D_pred']  # Reemplaza con los nombres de las columnas que deseas excluir

# Crear un nuevo DataFrame sin las columnas especificadas
pred_df_filtered = pred_df.drop(columns=columns_to_exclude)

# Verificar el resultado
print(pred_df_filtered.head())

                   id                                               text  \
0  eng_dev_track2_001  I have a floor shift in the morning, hopefully...   
1  eng_dev_track2_002  What is it about this winter that is making me...   
2  eng_dev_track2_003       Longest, most awkward drive I've ever taken.   
3  eng_dev_track2_004                    I know not why, I wipe my face.   
4  eng_dev_track2_005  And I laughed like this: garhahagar, because m...   

   Anger  Fear  Joy  Sadness  Surprise  
0      0     1    0        1         1  
1      0     1    0        1         1  
2      0     1    0        1         1  
3      0     1    0        1         0  
4      0     1    1        0         1  


In [16]:
# Reorganizar las columnas
columns_order = ['id','Anger', 'Fear', 'Joy', 'Sadness', 'Surprise']
pred_df_filtered = pred_df_filtered[columns_order]

# Guardar el DataFrame reorganizado en un nuevo archivo CSV
pred_df_filtered.to_csv("pred_eng_b.csv", index=False)

# Confirmar que el archivo ha sido guardado
print("El archivo 'pred_eng.csv' se ha guardado con las columnas reorganizadas.")


El archivo 'pred_eng.csv' se ha guardado con las columnas reorganizadas.


In [17]:
import os

# Verificar si el archivo existe en el directorio actual
print("modelo_entrenado.pth existe:", os.path.exists('modelo_entrenado.pth'))


modelo_entrenado.pth existe: True


In [18]:
from IPython.display import FileLink

# Crear un enlace para descargar el archivo
FileLink('modelo_entrenado.pth')


In [19]:
#DATOS DE TEST

In [20]:
import pandas as pd

# Cargar el archivo CSV
df3 = pd.read_csv("eng.csv")

# Verificar la estructura del archivo
df3

Unnamed: 0,id,text,anger,fear,joy,sadness,surprise
0,eng_test_track_b_00001,/ o \ So today I went in for a new exam with D...,,,,,
1,eng_test_track_b_00002,The image I have in my mind is this: a group o...,,,,,
2,eng_test_track_b_00003,"I slammed my fist against the door and yelled,...",,,,,
3,eng_test_track_b_00004,I could not unbend my knees.,,,,,
4,eng_test_track_b_00005,"I spent the night at the hotel, mostly hanging...",,,,,
...,...,...,...,...,...,...,...
2762,eng_test_track_b_02763,Better late then never!,,,,,
2763,eng_test_track_b_02764,"In the last three weeks, I have started lookin...",,,,,
2764,eng_test_track_b_02765,"But I never fell out, so it wasn't a problem.",,,,,
2765,eng_test_track_b_02766,""" So I will remain positive for as long as I l...",,,,,


In [21]:
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm

# Cargar los datos del archivo CSV
df4 = pd.read_csv("eng.csv")

# Predecir VAD usando el modelo entrenado
def predict_vad(df, text_column):
    model.eval()  # Cambiar el modelo a modo evaluación
    input_ids, attention_masks = BERT_tokenization(df, text_column)  # Tokenización de los textos

    dataset = TensorDataset(input_ids, attention_masks)
    dataloader = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=batch_size)

    predictions = []

    for batch in tqdm(dataloader, desc="Predicting"):
        b_input_ids, b_attention_mask = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            outputs = model(b_input_ids, attention_mask=b_attention_mask)
            logits = outputs.logits
            predictions.extend(logits.detach().cpu().numpy())

    return np.array(predictions)

# Obtener predicciones
predictions = predict_vad(df1, "text")

# Crear un DataFrame con las predicciones de VAD
pred_df1 = df4.copy()
pred_df1[["V_pred", "A_pred", "D_pred"]] = predictions

# Decodificar las predicciones de VAD a intensidades emocionales
emotion_labels = list(vad_values2.keys())
vad_matrix = np.array([[vad["V"], vad["A"], vad["D"]] for vad in vad_values2.values()])

def decode_vad_to_intensities(v, a, d):
    input_vad = np.array([v, a, d])
    distances = np.linalg.norm(vad_matrix - input_vad, axis=1)

    # Invertir las distancias para calcular intensidades proporcionales
    max_distance = np.max(distances)
    inverted_distances = max_distance - distances
    intensities = (inverted_distances / inverted_distances.sum()) * 3  # Escalar a 0-3
    intensities = np.round(intensities).astype(int)  # Redondear a enteros
    return pd.Series(intensities, index=emotion_labels)

# Aplicar la decodificación a las predicciones
decoded_intensities = pred_df1.apply(
    lambda row: decode_vad_to_intensities(row["V_pred"], row["A_pred"], row["D_pred"]),
    axis=1
)

# Añadir las intensidades decodificadas al DataFrame
for emotion in emotion_labels:
    pred_df1[emotion] = decoded_intensities[emotion]

# Guardar el DataFrame con las predicciones y las intensidades decodificadas
pred_df1.to_csv("predicciones_decodificadas.csv", index=False)
print("Archivo 'predicciones_decodificadas.csv' con intensidades decodificadas generado exitosamente.")

# Visualizar las primeras filas del DataFrame resultante
pred_df1


Predicting: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:13<00:00,  1.10it/s]


ValueError: Length of values (116) does not match length of index (2767)

In [None]:
# Lista de columnas a excluir
columns_to_exclude = ['V_pred', 'A_pred', 'D_pred']  # Reemplaza con los nombres de las columnas que deseas excluir

# Crear un nuevo DataFrame sin las columnas especificadas
pred_df_filtered = pred_df1.drop(columns=columns_to_exclude)

# Verificar el resultado
print(pred_df_filtered.head())

In [None]:
# Reorganizar las columnas
columns_order = ['id','Anger', 'Fear', 'Joy', 'Sadness', 'Surprise']
pred_df_filtered = pred_df_filtered[columns_order]

# Guardar el DataFrame reorganizado en un nuevo archivo CSV
pred_df_filtered.to_csv("pred_eng_b.csv", index=False)

# Confirmar que el archivo ha sido guardado
print("El archivo 'pred_eng.csv' se ha guardado con las columnas reorganizadas.")
