In [1]:
import os
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup, RobertaModel, RobertaTokenizer
import torch
from sklearn.preprocessing import MinMaxScaler
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report
import tqdm
import matplotlib.pyplot as plt
from collections import Counter
import pickle
import re
import copy
import pprint
import time

# Width of the one-hot speaker encoding built by speaker_transform.
MAX_NO_OF_SPEAKERS = 8
# Not referenced in the visible cells; presumably an upper bound on utterances per dialogue — TODO confirm.
MAX_DIALOGUE_LEN   = 33
# Default padded length (`max_len`) of pad_sequences.
MAX_SEQUENCE_LEN   = 24
# 27 label names; not referenced in the visible cells.
original_labels    = ['abuse', 'adoration', 'annoyance', 'awkwardness', 'benefit', 'boredom', 'calmness', 'challenge', 'cheer', 'confusion', 'curiosity', 'desire', 'excitement', 'guilt', 'horror', 'humour', 'impressed', 'loss', 'nervousness', 'nostalgia', 'pain', 'relief', 'satisfaction', 'scold', 'shock', 'sympathy', 'threat']
# 27 integers, one per entry of original_labels; presumably per-label training counts — TODO confirm.
train_count        = [31, 190, 1051, 880, 220, 78, 752, 214, 534, 486, 545, 180, 867, 216, 280, 153, 257, 351, 398, 65, 36, 173, 136, 94, 372, 209, 263]

# The seven emotion labels; they are one-hot encoded with pd.get_dummies further down.
EMOTIONS           = ['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise']

# NOTE(review): not referenced in the visible cells; the encoder loaded below is 'all-mpnet-base-v2'.
sent_model = 'roberta-base-nli-stsb-mean-tokens'

print('tr version', transformers.__version__)
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device => ",device, ' torch ', torch.__version__)

  from .autonotebook import tqdm as notebook_tqdm


tr version 4.35.2
Using device =>  cpu  torch  2.1.1+cu121


In [2]:
# NOTE(review): third-party import placed outside the import cell at the top of the notebook.
from sentence_transformers import SentenceTransformer
# Sentence encoder; used below to embed the utterances of every dialogue.
model = SentenceTransformer('all-mpnet-base-v2')

In [3]:
# Parse the test and train splits. Despite the `_csv` suffix, both are read from JSON files.
# NOTE(review): test_csv is not used by any visible cell.
test_csv = pd.read_json("EDiReF-Test-Data/MELD_test_efr.json")
train_csv = pd.read_json("EDiReF-Train-Data/Task 3/MELD_train_efr.json")

In [4]:
# Working frame for the training split; later cells overwrite several of its columns in place.
train_df = pd.DataFrame(train_csv)
# Display the frame (columns: episode, speakers, emotions, utterances, triggers).
train_df

Unnamed: 0,episode,speakers,emotions,utterances,triggers
0,utterance_0,"[Chandler, The Interviewer, Chandler, The Inte...","[neutral, neutral, neutral, neutral, surprise]",[also I was the point person on my company's t...,"[0.0, 0.0, 0.0, 1.0, 0.0]"
1,utterance_1,"[Chandler, The Interviewer, Chandler, The Inte...","[neutral, neutral, neutral, neutral, surprise,...",[also I was the point person on my company's t...,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0]"
2,utterance_2,"[Chandler, The Interviewer, Chandler, The Inte...","[neutral, neutral, neutral, neutral, surprise,...",[also I was the point person on my company's t...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
3,utterance_3,"[Chandler, The Interviewer, Chandler, The Inte...","[neutral, neutral, neutral, neutral, surprise,...",[also I was the point person on my company's t...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,utterance_4,"[Joey, Rachel, Joey, Rachel]","[surprise, sadness, surprise, fear]",[But then who? The waitress I went out with la...,"[0.0, 0.0, 1.0, 0.0]"
...,...,...,...,...,...
3995,utterance_3995,"[Chandler, All, Monica, Chandler, Ross, Chandl...","[neutral, joy, neutral, neutral, surprise, dis...","[Hey., Hey!, So how was Joan?, I broke up with...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3996,utterance_3996,"[Chandler, All, Monica, Chandler, Ross, Chandl...","[neutral, joy, neutral, neutral, surprise, dis...","[Hey., Hey!, So how was Joan?, I broke up with...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3997,utterance_3997,"[Chandler, All, Monica, Chandler, Ross, Chandl...","[neutral, joy, neutral, neutral, surprise, dis...","[Hey., Hey!, So how was Joan?, I broke up with...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3998,utterance_3998,"[Chandler, All, Monica, Chandler, Ross, Chandl...","[neutral, joy, neutral, neutral, surprise, dis...","[Hey., Hey!, So how was Joan?, I broke up with...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [5]:
# Indicator table over EMOTIONS: column `e` is True only on the row belonging to emotion `e`,
# so dummies[e] serves as the one-hot vector of `e` (see the 'anger' column displayed below).
dummies = pd.get_dummies(EMOTIONS)
dummies['anger']

0     True
1    False
2    False
3    False
4    False
5    False
6    False
Name: anger, dtype: bool

In [6]:
# Swap every emotion label for its one-hot column of `dummies`, keeping the
# dialogue -> utterance nesting of train_df['emotions'].
# NOTE: this expects the raw string labels, so it is not meant to be re-run after
# train_df['emotions'] has been overwritten with the encoded values.
listaEmo = [
    [dummies[label] for label in dialogue_emotions]
    for dialogue_emotions in train_df['emotions']
]

In [7]:
# Replace the string emotion labels with their one-hot encodings (overwrites the column in place).
train_df['emotions'] = listaEmo
train_df

Unnamed: 0,episode,speakers,emotions,utterances,triggers
0,utterance_0,"[Chandler, The Interviewer, Chandler, The Inte...","[[False, False, False, False, True, False, Fal...",[also I was the point person on my company's t...,"[0.0, 0.0, 0.0, 1.0, 0.0]"
1,utterance_1,"[Chandler, The Interviewer, Chandler, The Inte...","[[False, False, False, False, True, False, Fal...",[also I was the point person on my company's t...,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0]"
2,utterance_2,"[Chandler, The Interviewer, Chandler, The Inte...","[[False, False, False, False, True, False, Fal...",[also I was the point person on my company's t...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
3,utterance_3,"[Chandler, The Interviewer, Chandler, The Inte...","[[False, False, False, False, True, False, Fal...",[also I was the point person on my company's t...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,utterance_4,"[Joey, Rachel, Joey, Rachel]","[[False, False, False, False, False, False, Tr...",[But then who? The waitress I went out with la...,"[0.0, 0.0, 1.0, 0.0]"
...,...,...,...,...,...
3995,utterance_3995,"[Chandler, All, Monica, Chandler, Ross, Chandl...","[[False, False, False, False, True, False, Fal...","[Hey., Hey!, So how was Joan?, I broke up with...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3996,utterance_3996,"[Chandler, All, Monica, Chandler, Ross, Chandl...","[[False, False, False, False, True, False, Fal...","[Hey., Hey!, So how was Joan?, I broke up with...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3997,utterance_3997,"[Chandler, All, Monica, Chandler, Ross, Chandl...","[[False, False, False, False, True, False, Fal...","[Hey., Hey!, So how was Joan?, I broke up with...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3998,utterance_3998,"[Chandler, All, Monica, Chandler, Ross, Chandl...","[[False, False, False, False, True, False, Fal...","[Hey., Hey!, So how was Joan?, I broke up with...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [8]:
def speaker_transform(df: pd.Series, max_speakers: int = None) -> list:
    """One-hot encode the speaker of every utterance, dialogue by dialogue.

    Speaker columns are assigned per dialogue: the unique names of a dialogue
    are sorted and the k-th name gets column k, so the same person can land
    in different columns in different dialogues.

    Args:
        df: Iterable of dialogues, each a sequence of hashable speaker names
            (in this notebook the `speakers` column of the training frame).
        max_speakers: Number of columns of every returned array. Defaults to
            the module-level MAX_NO_OF_SPEAKERS.

    Returns:
        A list with one float array of shape (len(dialogue), max_speakers)
        per dialogue; row i has a single 1 in the column of utterance i's speaker.

    Raises:
        ValueError: If a dialogue has more than `max_speakers` distinct speakers.
    """
    if max_speakers is None:
        max_speakers = MAX_NO_OF_SPEAKERS

    encoded = []
    for conversation in df:
        # Dict lookup replaces the linear list.index() search per utterance.
        column_of = {name: col for col, name in enumerate(sorted(set(conversation)))}
        if len(column_of) > max_speakers:
            raise ValueError(
                f"dialogue has {len(column_of)} distinct speakers, "
                f"but only {max_speakers} columns are available"
            )
        one_hot = np.zeros((len(conversation), max_speakers))
        for row, name in enumerate(conversation):
            one_hot[row, column_of[name]] = 1
        encoded.append(one_hot)
    return encoded


In [9]:
# Overwrite the raw speaker names with their per-dialogue one-hot arrays (in-place column replacement).
train_df['speakers'] = speaker_transform(train_df['speakers'])

In [10]:
# Sentence embeddings, one entry per dialogue (model.encode is called on the
# dialogue's full list of utterances). Encoding is expensive, so the result is
# cached on disk and reused on later runs.
embedding_cache_path = 'df_sent.pkl'

if os.path.exists(embedding_cache_path):
    # NOTE: pickle.load can execute arbitrary code; only load a cache file you created yourself.
    with open(embedding_cache_path, 'rb') as f:
        sentence_embeddings = pickle.load(f)
else:
    # Iterate over the column directly instead of a manual counter with label-based indexing.
    sentence_embeddings = [model.encode(utterances) for utterances in train_df['utterances']]
    # Write the cache so the "file exists" branch above can actually be taken next time.
    with open(embedding_cache_path, 'wb') as f:
        pickle.dump(sentence_embeddings, f)


In [11]:
# Attach the per-dialogue embeddings as a new column (one entry per row of train_df).
train_df['sentence_embeddings'] = sentence_embeddings

In [12]:
# Word-level lexicon with Valence / Arousal / Dominance scores. Column names are supplied
# here; as the printed output shows, the words themselves end up as the index.
csvread = pd.read_csv("./EDiReF-Train-Data/Task 3/out.csv",names=["Valence", "Arousal", "Dominance"])
print(csvread)

           Valence  Arousal  Dominance
aaaaaaah     0.479    0.606      0.291
aaaah        0.520    0.636      0.282
aardvark     0.427    0.490      0.437
aback        0.385    0.407      0.288
abacus       0.510    0.276      0.485
...            ...      ...        ...
zoo          0.760    0.520      0.580
zoological   0.667    0.458      0.492
zoology      0.568    0.347      0.509
zoom         0.490    0.520      0.462
zucchini     0.510    0.321      0.250

[19971 rows x 3 columns]


In [13]:
import re
from collections import defaultdict

# Strips everything except ASCII letters from a token.
non_letters = re.compile(r'[^a-zA-Z]')

track = defaultdict(list)

for dialogue in train_df['utterances']:
    for utterance in dialogue:
        for token in utterance.lower().split():
            key = non_letters.sub('', token)
            # Record each lexicon word once, the first time it is seen.
            if key in track or key not in csvread.index:
                continue
            track[key].append(csvread['Valence'][key])

# track now maps every cleaned word that occurs in the lexicon to a one-element
# list holding its Valence score (Arousal and Dominance are not stored).


In [14]:
# Number of whitespace-separated tokens of every utterance in the training split.
tokens_per_utterance = [
    len(sentence.lower().split())
    for utterances in train_df['utterances']
    for sentence in utterances
]
# Longest utterance, in tokens; displayed as the cell output.
max_word_count = max(tokens_per_utterance)
max_word_count

69

In [15]:
def _valence_of(word):
    """Return the valence stored in `track` for the letters-only form of `word`, or 0 if it has none."""
    key = re.sub(r'[^a-zA-Z]', '', word)
    return float(track[key][0]) if key in track else 0


# valen[d][u] holds one valence value per whitespace token of utterance u in dialogue d.
valen = [
    [[_valence_of(word) for word in sentence.lower().split()] for sentence in utterances]
    for utterances in train_df['utterances']
]
# Arousal and dominance were built the same way in an earlier version and are currently disabled.


In [16]:
def pad_sequences_mean(sequences, max_len):
    """Right-pad a flat sequence of numbers with zeros and build its padding mask.

    Args:
        sequences: Iterable of scalar values (e.g. one valence score per word).
        max_len: Length of both returned arrays.

    Returns:
        A tuple (padded, mask) of 1-D float arrays of length `max_len`.
        `padded` holds the input values followed by zeros; `mask` is 0 at
        positions that hold real values and 1 at padded positions.
        Inputs longer than `max_len` are truncated to their first `max_len`
        values instead of raising an IndexError.
    """
    values = list(sequences)[:max_len]
    filled = len(values)

    padded_sequences = np.zeros((max_len))
    mask = np.ones((max_len))

    # Slice assignment replaces the element-by-element Python loop.
    padded_sequences[:filled] = values
    mask[:filled] = 0

    return padded_sequences, mask

In [17]:
# Pad every utterance's valence vector to the longest utterance in the data.
# Each sentence is padded once (the padded values and the mask come from the same call),
# and the target length is max_word_count rather than a hard-coded copy of its value.
padded_pairs = [
    [pad_sequences_mean(sentence, max_word_count) for sentence in utterances]
    for utterances in valen
]
padded_valen = [[padded for padded, _ in dialogue] for dialogue in padded_pairs]
mask_valen = [[pad_mask for _, pad_mask in dialogue] for dialogue in padded_pairs]

In [18]:
#meanlist = [[[np.mean(l), np.mean(r), np.mean(o)] for l, r ,o in zip(v, a, d)] for v, a, d in zip(valen, aros, domi)]

In [19]:
#hstack_meanlist = [np.hstack((v, a, d)) for v, a, d in zip(temp_valen, temp_aros, temp_domi)]

In [20]:
def pad_sequences(sequences, num_features = 0, max_len = MAX_SEQUENCE_LEN, only_mask = False):
    """Zero-pad one dialogue-level sequence to `max_len` rows, or build its padding mask.

    Args:
        sequences: Sized, sliceable sequence with one entry per utterance
            (list or array); entries are scalars when `num_features` is 0,
            otherwise vectors of length `num_features`.
        num_features: Width of each entry; 0 produces a 1-D result.
        max_len: Number of rows of the result.
        only_mask: If True, return only the mask instead of the padded data.

    Returns:
        With `only_mask`: a float array of length `max_len` that is 0 for real
        positions and 1 for padded ones. Otherwise: a float array of shape
        (max_len, num_features), or (max_len,) when `num_features` is 0,
        holding `sequences` followed by zeros.
        Inputs longer than `max_len` are truncated in both modes; previously
        only the mask mode tolerated them and the data mode raised.
    """
    if only_mask:
        mask = np.ones((max_len))
        mask[:len(sequences)] = 0
        return mask

    padded = np.zeros((max_len, num_features)) if num_features != 0 else np.zeros((max_len))
    # Only slice when needed so the common (short enough) case is assigned unchanged.
    kept = sequences if len(sequences) <= max_len else sequences[:max_len]
    padded[:len(kept)] = kept
    return padded

In [21]:
# Feature widths are taken from the constants that produced the data, so a change
# to MAX_NO_OF_SPEAKERS or EMOTIONS cannot silently disagree with the padding.
embedding_dim = 768  # width of each sentence embedding row, as used by the original cell

padded_speakers = [pad_sequences(sequence, MAX_NO_OF_SPEAKERS) for sequence in train_df['speakers']]
padded_emotions = [pad_sequences(sequence, len(EMOTIONS)) for sequence in train_df['emotions']]
padded_sentence_embeddings = [pad_sequences(sequence, embedding_dim) for sequence in train_df['sentence_embeddings']]
padded_trigger = [pad_sequences(sequence) for sequence in train_df['triggers']]
# 0 marks real utterances, 1 marks padded positions.
mask = [pad_sequences(sequence, only_mask=True) for sequence in train_df['sentence_embeddings']]

In [22]:
# Store the padded features (and the masks) on the frame; existing columns are
# overwritten, new ones are appended in the order listed here.
padded_columns = {
    'speakers': padded_speakers,
    'emotions': padded_emotions,
    'sentence_embeddings': padded_sentence_embeddings,
    'valence': padded_valen,
    'valence_mask': mask_valen,
    'triggers': padded_trigger,
    'mask': mask,
}
for column_name, column_values in padded_columns.items():
    train_df[column_name] = column_values

In [23]:
#pickle.dump(train_df,open('test_df.pkl', 'wb'))

In [24]:
def _nan_to_zero(trigger_row):
    """Return `trigger_row` as a list in which every missing (NaN) entry is replaced by 0."""
    return [0 if pd.isna(value) else value for value in trigger_row]


# Clean the trigger labels row by row.
train_df['triggers'] = train_df['triggers'].apply(_nan_to_zero)

In [25]:
# Persist the preprocessed training frame. The context manager guarantees the
# file is flushed and closed; the original left the handle open.
with open('train_df.pkl', 'wb') as f:
    pickle.dump(train_df, f)

In [None]:
# NOTE(review): `meanT` is not defined in any visible cell, so this cell fails on a fresh
# kernel unless it is created elsewhere — TODO confirm where it comes from.
# Scratch check: a NaN-filled vector as long as meanT[0], with position 2 set to 5.
nanlist = np.full(len(meanT[0]), np.nan)
nanlist[2] = 5
nanlist

array([nan, nan,  5., nan, nan])

In [None]:
import copy
# For every dialogue, build one vector per distinct speaker: the vector is as long as
# meanT[i], holds meanT[i][j] at the positions j where that speaker talks, and NaN elsewhere.
# NOTE(review): `meanT` is not defined in any visible cell — TODO confirm its origin.
# NOTE(review): the speakers are used as dict keys, so this needs hashable speaker values.
# In top-to-bottom order train_df['speakers'] has already been replaced by numpy arrays,
# which are not hashable; the cell appears to rely on the raw names from an earlier state.
lista = []

for i in range(len(train_df['speakers'])):
    diccionario = {}
    nanlist = np.full(len(meanT[i]), np.nan)
    
    for j, speaker in enumerate(train_df['speakers'][i]):
        if speaker in diccionario:
            diccionario[speaker][j] = meanT[i][j]
        else:
            nanlist_copy = copy.copy(nanlist)  # Create an independent copy of nanlist
            nanlist_copy[j] = meanT[i][j]
            diccionario[speaker] = nanlist_copy

    lista.append(list(diccionario.values()))



In [None]:
# Replace the NaN placeholders of every per-speaker vector with -1 and store the result
# as a new column. (`replaced_valance` is the original spelling of the variable.)
replaced_valance = [np.nan_to_num(x, nan=-1) for x in lista]
train_df['valence_speaker']=replaced_valance

In [None]:
# Spot check: per-speaker vectors of the row labelled 174 (-1 marks positions without a value).
train_df['valence_speaker'][174]

array([[ 0.19283333, -1.        ,  0.412875  , -1.        , -1.        ,
        -1.        , -1.        , -1.        , -1.        ,  0.14285714,
        -1.        ,  0.255     , -1.        , -1.        ],
       [-1.        ,  0.501125  , -1.        , -1.        , -1.        ,
         0.        , -1.        ,  0.170125  , -1.        , -1.        ,
         0.        , -1.        , -1.        ,  0.32129412],
       [-1.        , -1.        , -1.        ,  0.1855    , -1.        ,
        -1.        , -1.        , -1.        , -1.        , -1.        ,
        -1.        , -1.        , -1.        , -1.        ],
       [-1.        , -1.        , -1.        , -1.        ,  0.27875   ,
        -1.        ,  0.40166667, -1.        , -1.        , -1.        ,
        -1.        , -1.        , -1.        , -1.        ],
       [-1.        , -1.        , -1.        , -1.        , -1.        ,
        -1.        , -1.        , -1.        ,  0.4185    , -1.        ,
        -1.        , -1.  

In [None]:
# NOTE(review): displays the whole frame; train_df.head() would keep the output small.
train_df

In [None]:

# Dialogue to inspect.
dialogue_idx = 62
# Value vectors of the first two speakers of that dialogue.
speaker1_data, speaker2_data = lista[dialogue_idx][0], lista[dialogue_idx][1]
# Trigger labels of the same dialogue.
trigger_flags = train_df['triggers'][dialogue_idx]

# One scatter series per vector, each plotted against its own positions 0..len-1.
for series in (speaker1_data, speaker2_data, trigger_flags):
    plt.scatter(list(range(len(series))), series)
plt.xlabel('Índice')
plt.ylabel('Valor')
plt.title('Gráfica de Líneas del Hablante')

# Render the figure, then print the raw vectors of both speakers.
plt.show()
print(speaker1_data, speaker2_data)