In [2]:
import os
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup, RobertaModel, RobertaTokenizer
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report
import tqdm
import matplotlib.pyplot as plt
from collections import Counter
import pickle
import re
import copy
import pprint
import time

# Dataset-wide limits. MAX_DIALOGUE_LEN is reused further down as a padding width.
MAX_NO_OF_SPEAKERS = 8
MAX_DIALOGUE_LEN = 33

# 27 labels and 27 counts, index-aligned.
# NOTE(review): presumably train_count[k] is the number of training samples for
# original_labels[k]; neither list is referenced in the visible cells — confirm
# they are still needed.
original_labels = [
    'abuse', 'adoration', 'annoyance', 'awkwardness', 'benefit', 'boredom',
    'calmness', 'challenge', 'cheer', 'confusion', 'curiosity', 'desire',
    'excitement', 'guilt', 'horror', 'humour', 'impressed', 'loss',
    'nervousness', 'nostalgia', 'pain', 'relief', 'satisfaction', 'scold',
    'shock', 'sympathy', 'threat',
]
train_count = [
    31, 190, 1051, 880, 220, 78,
    752, 214, 534, 486, 545, 180,
    867, 216, 280, 153, 257, 351,
    398, 65, 36, 173, 136, 94,
    372, 209, 263,
]

# Emotion classes; used below to build the one-hot lookup table.
EMOTIONS = [
    'anger',
    'disgust',
    'fear',
    'joy',
    'neutral',
    'sadness',
    'surprise',
]

# NOTE(review): looks like a sentence-embedding model id; unused in the visible cells.
sent_model = 'roberta-base-nli-stsb-mean-tokens'

print('tr version', transformers.__version__)
# Prefer the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device => ",device, ' torch ', torch.__version__)

  from .autonotebook import tqdm as notebook_tqdm


tr version 4.35.2
Using device =>  cpu  torch  2.1.1+cu121


In [3]:
class EmotionClassifier(nn.Module):
    """RoBERTa encoder followed by dropout and a linear classification head.

    Args:
        n_classes: Number of logits produced by the linear head.
    """

    def __init__(self, n_classes):
        super().__init__()
        # Attribute names (`bert`, `drop`, `out`) are kept unchanged because
        # they determine the keys of the module's state_dict.
        self.bert = RobertaModel.from_pretrained('roberta-base')
        self.drop = nn.Dropout(p=0.3)
        hidden_size = self.bert.config.hidden_size
        self.out = nn.Linear(hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Run the encoder and classify its pooled representation.

        Args:
            input_ids: Token ids forwarded to the RoBERTa encoder.
            attention_mask: Attention mask forwarded to the RoBERTa encoder.

        Returns:
            A ``(logits, pooled)`` tuple: the output of the linear head, and
            element 1 of the encoder output *before* dropout is applied
            (the pooled output in Hugging Face's ``RobertaModel``).
        """
        encoder_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = encoder_output[1]
        logits = self.out(self.drop(pooled))
        return logits, pooled

# Build the tokenizer and a 7-class EmotionClassifier (7 == len(EMOTIONS)) on `device`.
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_finetuned = EmotionClassifier(7).to(device)
# NOTE(review): the two lines that load the fine-tuned checkpoint are commented out,
# so despite its name `roberta_finetuned` carries no fine-tuned weights at this point
# and the message below is printed regardless — confirm this is intended.
#roberta_tf_checkpoint = torch.load('dump_files/finetuned/best_model_state_roberta.bin', map_location=torch.device(device))
#roberta_finetuned.load_state_dict(roberta_tf_checkpoint)
print('model loaded')


# Helper functions



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model loaded


In [4]:
# Load the training JSON (path is relative to the notebook's working directory).
# Despite the `_csv` suffix, this is the object returned by pd.read_json.
train_csv = pd.read_json("EDiReF-Train-Data/Task 3/MELD_train_efr.json")

In [5]:
# Main working frame: one row per episode with list-valued columns
# (speakers, emotions, utterances, triggers).
train_df = pd.DataFrame(train_csv)
train_df

Unnamed: 0,episode,speakers,emotions,utterances,triggers
0,utterance_0,"[Chandler, The Interviewer, Chandler, The Inte...","[neutral, neutral, neutral, neutral, surprise]",[also I was the point person on my company's t...,"[0.0, 0.0, 0.0, 1.0, 0.0]"
1,utterance_1,"[Chandler, The Interviewer, Chandler, The Inte...","[neutral, neutral, neutral, neutral, surprise,...",[also I was the point person on my company's t...,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0]"
2,utterance_2,"[Chandler, The Interviewer, Chandler, The Inte...","[neutral, neutral, neutral, neutral, surprise,...",[also I was the point person on my company's t...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
3,utterance_3,"[Chandler, The Interviewer, Chandler, The Inte...","[neutral, neutral, neutral, neutral, surprise,...",[also I was the point person on my company's t...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,utterance_4,"[Joey, Rachel, Joey, Rachel]","[surprise, sadness, surprise, fear]",[But then who? The waitress I went out with la...,"[0.0, 0.0, 1.0, 0.0]"
...,...,...,...,...,...
3995,utterance_3995,"[Chandler, All, Monica, Chandler, Ross, Chandl...","[neutral, joy, neutral, neutral, surprise, dis...","[Hey., Hey!, So how was Joan?, I broke up with...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3996,utterance_3996,"[Chandler, All, Monica, Chandler, Ross, Chandl...","[neutral, joy, neutral, neutral, surprise, dis...","[Hey., Hey!, So how was Joan?, I broke up with...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3997,utterance_3997,"[Chandler, All, Monica, Chandler, Ross, Chandl...","[neutral, joy, neutral, neutral, surprise, dis...","[Hey., Hey!, So how was Joan?, I broke up with...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3998,utterance_3998,"[Chandler, All, Monica, Chandler, Ross, Chandl...","[neutral, joy, neutral, neutral, surprise, dis...","[Hey., Hey!, So how was Joan?, I broke up with...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [6]:
# Sanity check: emotions and speakers of the first episode are parallel lists.
print(train_df['emotions'][0], train_df['speakers'][0])

['neutral', 'neutral', 'neutral', 'neutral', 'surprise'] ['Chandler', 'The Interviewer', 'Chandler', 'The Interviewer', 'Chandler']


In [7]:
# One-hot lookup table: one column per emotion name, one row per entry of EMOTIONS.
# dummies[name] is the indicator column for that emotion.
dummies = pd.get_dummies(EMOTIONS)
dummies['anger']

0     True
1    False
2    False
3    False
4    False
5    False
6    False
Name: anger, dtype: bool

In [8]:
# One-hot encode every emotion label while keeping the episode -> utterance nesting:
# listaEmo[e][u] is the `dummies` column for the u-th emotion of episode e.
listaEmo = [
    [dummies[emotion] for emotion in episode_emotions]
    for episode_emotions in train_df['emotions']
]

In [9]:
# Alphabetically sorted list of every distinct speaker name in the training set.
# A set removes duplicates in one pass; the previous version tested membership
# against a growing list, which rescans the list for every speaker occurrence.
listSpk = sorted(
    {speaker for episode_speakers in train_df['speakers'] for speaker in episode_speakers}
)
# One-hot table with one column per speaker name.
speaker_specific = pd.get_dummies(listSpk)
speaker_specific

Unnamed: 0,1st Customer,2nd Customer,3rd Customer,A Female Student,A Student,Alice,All,Allesandro,Angela,Annabelle,...,Tour Guide,Trudie Styler,Ursula,Voice,Waiter,Wayne,Woman,Woman On Train,Young Ethan,an
0,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
226,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
227,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
228,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
229,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False


In [10]:
# listasp = []
# for i in train_df['speakers']:
#     listatemp = []
#     for j in i:
#         listatemp.append(speaker_specific[j])
#     listasp.append(listatemp)

In [11]:
# train_df['speakers'] = listasp
# train_df

In [12]:
# i = 0
# sentence_embeddings = []
#     # sent_emb = model.encode('')
# while i < len(train_df):
#     utt = train_df['utterances'][i]
#     encodings = roberta_tokenizer.encode_plus(utt, max_length=100, padding = 'max_length', add_special_tokens=True, return_token_type_ids=True, return_attention_mask=True, truncation=True, return_tensors='pt').to(device)
#     utt_emb = roberta_finetuned(encodings['input_ids'], encodings['attention_mask'])[1].detach().tolist()[0]
#     utt_emb = np.round(utt_emb, decimals = 10)
#     # utt_emb = model.encode(utt)
#     sent_emb = utt_emb
#     i += 1
#     sentence_embeddings.append(copy.deepcopy(sent_emb))


In [13]:
# train_df['sentence_embeddings'] = sentence_embeddings
# df_sent = pd.DataFrame(sentence_embeddings)

In [14]:
# Load the word-level Valence/Arousal/Dominance table. Only three column names are
# supplied, and the printed frame shows the words as the index; later cells rely on
# that (`word in csvread.index`).
csvread = pd.read_csv("./EDiReF-Train-Data/Task 3/out.csv",names=["Valence", "Arousal", "Dominance"])
print(csvread)

           Valence  Arousal  Dominance
aaaaaaah     0.479    0.606      0.291
aaaah        0.520    0.636      0.282
aardvark     0.427    0.490      0.437
aback        0.385    0.407      0.288
abacus       0.510    0.276      0.485
...            ...      ...        ...
zoo          0.760    0.520      0.580
zoological   0.667    0.458      0.492
zoology      0.568    0.347      0.509
zoom         0.490    0.520      0.462
zucchini     0.510    0.321      0.250

[19971 rows x 3 columns]


In [15]:
import re
from collections import defaultdict

# Cache of lexicon scores for every word that actually occurs in the training
# utterances: cleaned word -> [valence, arousal, dominance].
track = defaultdict(list)

for episode_utterances in train_df['utterances']:
    for utterance in episode_utterances:
        for token in utterance.lower().split():
            # Strip everything except ASCII letters (punctuation, digits, apostrophes).
            key = re.sub(r'[^a-zA-Z]', '', token)
            # Skip words already cached and words the lexicon does not know.
            if key in track or key not in csvread.index:
                continue
            for column in ('Valence', 'Arousal', 'Dominance'):
                track[key].append(csvread[column][key])

# At this point `track` holds the cleaned words as keys and, for each one, a
# three-element list with its Valence, Arousal and Dominance values (in that order).


In [54]:
def _utterance_vad(utterance):
    """Return three per-word lists (valence, arousal, dominance) for one utterance.

    Words are lower-cased, split on whitespace and stripped of non-letters before
    being looked up in `track`; words without an entry contribute 0 to each list.
    """
    valence_sen, arousal_sen, dominance_sen = [], [], []
    for token in utterance.lower().split():
        key = re.sub(r'[^a-zA-Z]', '', token)
        if key in track:
            val, aro, dom = track[key]
            scores = (float(val), float(aro), float(dom))
        else:
            scores = (0, 0, 0)
        valence_sen.append(scores[0])
        arousal_sen.append(scores[1])
        dominance_sen.append(scores[2])
    return valence_sen, arousal_sen, dominance_sen


# valen[e][u][w] is the valence of word w in utterance u of episode e;
# aros and domi have the same nesting for arousal and dominance.
valen, aros, domi = [], [], []
for episode_utterances in train_df['utterances']:
    per_utterance = [_utterance_vad(utterance) for utterance in episode_utterances]
    valen.append([scores[0] for scores in per_utterance])
    aros.append([scores[1] for scores in per_utterance])
    domi.append([scores[2] for scores in per_utterance])

In [41]:
# Mean valence per utterance: meanT[e][u] averages the word-level scores in valen[e][u]
# (unknown words count as 0 in that average).
# An utterance with no tokens has an empty score list; np.mean([]) would emit a
# RuntimeWarning and return NaN, and NaN is used later in this notebook as the
# placeholder for "this speaker did not speak here". Such utterances get 0.0 instead.
meanT = [
    [np.mean(word_scores) if len(word_scores) else 0.0 for word_scores in episode_scores]
    for episode_scores in valen
]

In [56]:
# Spot check: number of episodes and the word-level valence lists of episode 3999,
# shown next to the utterances they were computed from.
print(len(valen),valen[3999])

train_df['utterances'][3999]

4000 [[0], [0], [0, 0, 0, 0], [0, 0.245, 0, 0, 0], [0, 0.551, 0, 0, 0, 0, 0.802, 0.396, 0.449], [0, 0, 0.573, 0, 0, 0, 0, 0, 0, 0, 0], [0.529, 0, 0, 0, 0, 0, 0.573], [0, 0, 0, 0, 0.57, 0.438, 0, 0, 0.635, 0, 0.667], [0, 0, 0, 0.823, 0, 0, 0, 0, 0.06, 0, 0, 0, 0.357, 0.083, 0.519], [0.448, 0, 0.448, 0], [0, 0, 0.439, 0, 0.51, 0, 0, 0.542], [0, 0, 0.625, 0, 0, 0, 0.847, 0, 0, 0, 0, 0.806, 0, 0.49, 0, 0, 0, 0, 0.49, 0.958, 0.844, 0, 0, 0, 0, 0, 0, 0.811], [0, 0, 0, 0.594], [0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0.757, 0, 0], [0, 0, 0, 0, 0, 0, 0.74], [0]]


['Hey.',
 'Hey!',
 'So how was Joan?',
 'I broke up with her.',
 "Don't tell me, because of the big nostril thing?",
 'They were huge. When she sneezed, bats flew out of them.',
 'Come on, they were not that huge.',
 "I'm tellin' you, she leaned back; I could see her brain.",
 'How many perfectly fine women are you gonna reject over the most superficial insignificant things?',
 'Hold it, hold it.',
 'I gotta side with Chandler on this one.',
 "When I first moved to the city, I went out a couple of times with this girl, really hot, great kisser, but she had the biggest Adam's apple.",
 'It made me nuts.',
 'You or me?',
 "I got it. Uh, Joey, women don't have Adam's apples.",
 'You guys are messing with me, right?',
 'Yeah.']

In [57]:
# Right-pad every per-utterance valence list with 0.0 up to MAX_DIALOGUE_LEN entries.
# Lists that already hold MAX_DIALOGUE_LEN or more entries are left unchanged
# (a non-positive repeat count adds nothing and nothing is truncated).
# NOTE(review): the padded lists are word-level, while the constant's name suggests
# a per-dialogue limit — confirm this is the intended width.
valen_padded = []
for episode_scores in valen:
    padded_episode = []
    for word_scores in episode_scores:
        missing = MAX_DIALOGUE_LEN - len(word_scores)
        padded_episode.append(word_scores + [0.0] * missing)
    valen_padded.append(padded_episode)

In [63]:
# Reload previously saved artefacts. This replaces the `train_df` built from the JSON file.
# SECURITY: pickle.load can execute arbitrary code — only load files you created yourself.
# Context managers make sure both file handles are closed.
with open('train_df.pkl', 'rb') as train_df_file:
    train_df = pickle.load(train_df_file)
with open('df_sent (1).pkl', 'rb') as df_sent_file:
    df_sent = pickle.load(df_sent_file)

In [66]:
# Attach the padded word-level valence lists and the sentence embeddings.
train_df['valence'] = valen_padded
train_df['sentence_embeddings'] = df_sent
# Remove the per-speaker valence column if present. errors='ignore' keeps this cell
# re-runnable: train_df.pkl is written back without the column, so a second pass
# would otherwise fail with KeyError.
train_df = train_df.drop(columns=['valence_speaker'], errors='ignore')

In [69]:
# Persist the enriched frame. NOTE: this overwrites the same 'train_df.pkl' the
# notebook loads from. The context manager guarantees the file is flushed and closed.
with open('train_df.pkl', 'wb') as train_df_file:
    pickle.dump(train_df, train_df_file)

In [42]:
# Scratch demo of the NaN-placeholder array: as long as the first episode, all NaN
# except one slot. Requires the first episode to have at least three utterances.
nanlist = np.full(len(meanT[0]), np.nan)
nanlist[2] = 5
nanlist

array([nan, nan,  5., nan, nan])

In [44]:
# Per-speaker valence tracks. lista[e] is a list with one array per distinct speaker
# of episode e (in order of first appearance); each array is as long as the episode
# and holds the utterance's mean valence at the turns where that speaker talks and
# NaN everywhere else.
lista = []

for episode_idx in range(len(train_df['speakers'])):
    utterance_means = meanT[episode_idx]
    rows_by_speaker = {}

    for turn, speaker in enumerate(train_df['speakers'][episode_idx]):
        if speaker not in rows_by_speaker:
            # First time this speaker appears: start from a fresh all-NaN row.
            rows_by_speaker[speaker] = np.full(len(utterance_means), np.nan)
        rows_by_speaker[speaker][turn] = utterance_means[turn]

    lista.append(list(rows_by_speaker.values()))



In [45]:
# Replace the NaN placeholders of every episode with -1 and store the result
# as a new column.
replaced_valance = []
for speaker_rows in lista:
    replaced_valance.append(np.nan_to_num(speaker_rows, nan=-1))
train_df['valence_speaker'] = replaced_valance

In [46]:
# Spot check episode 174: one row per speaker, -1 where that speaker has no value.
train_df['valence_speaker'][174]

array([[ 0.19283333, -1.        ,  0.412875  , -1.        , -1.        ,
        -1.        , -1.        , -1.        , -1.        ,  0.14285714,
        -1.        ,  0.255     , -1.        , -1.        ],
       [-1.        ,  0.501125  , -1.        , -1.        , -1.        ,
         0.        , -1.        ,  0.170125  , -1.        , -1.        ,
         0.        , -1.        , -1.        ,  0.32129412],
       [-1.        , -1.        , -1.        ,  0.1855    , -1.        ,
        -1.        , -1.        , -1.        , -1.        , -1.        ,
        -1.        , -1.        , -1.        , -1.        ],
       [-1.        , -1.        , -1.        , -1.        ,  0.27875   ,
        -1.        ,  0.40166667, -1.        , -1.        , -1.        ,
        -1.        , -1.        , -1.        , -1.        ],
       [-1.        , -1.        , -1.        , -1.        , -1.        ,
        -1.        , -1.        , -1.        ,  0.4185    , -1.        ,
        -1.        , -1.  

In [None]:
# Display the frame to inspect its current columns.
train_df

In [None]:

# Episode to inspect.
episode_idx = 62
# Per-utterance values of the first two speakers of that episode
# (taken from `lista`, so turns of other speakers are NaN).
speaker1_data = lista[episode_idx][0]
speaker2_data = lista[episode_idx][1]
# Trigger flags of the same episode, plotted on the same utterance axis.
triggers = train_df['triggers'][episode_idx]

# One scatter series per sequence, with the utterance position on the x axis.
for series in (speaker1_data, speaker2_data, triggers):
    positions = list(range(len(series)))
    plt.scatter(positions, series)
plt.xlabel('Índice')
plt.ylabel('Valor')
plt.title('Gráfica de Líneas del Hablante')

# Render the figure, then print the raw values that were plotted.
plt.show()
print(speaker1_data, speaker2_data)