In [55]:
import os
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report
import lightning as L
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
import tqdm
import matplotlib.pyplot as plt
from collections import Counter
import pickle
import re
import copy
import time

# Scoring
from sklearn.metrics import classification_report, f1_score
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device => ",device, ' torch ', torch.__version__)


# hyper parameters
SEED = 42
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
    
#@title Hyper Parameters { display-mode: "both" }

EPOCHS             = 20
MAX_NO_OF_SPEAKERS = 231
MAX_DIALOGUE_LEN   = 33
original_labels    = ['abuse', 'adoration', 'annoyance', 'awkwardness', 'benefit', 'boredom', 'calmness', 'challenge', 'cheer', 'confusion', 'curiosity', 'desire', 'excitement', 'guilt', 'horror', 'humour', 'impressed', 'loss', 'nervousness', 'nostalgia', 'pain', 'relief', 'satisfaction', 'scold', 'shock', 'sympathy', 'threat']
train_count        = [31, 190, 1051, 880, 220, 78, 752, 214, 534, 486, 545, 180, 867, 216, 280, 153, 257, 351, 398, 65, 36, 173, 136, 94, 372, 209, 263]

EMOTIONS           = ['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise']

# DataLoader Hyperparamaters
BATCH_SIZE = 64

# Module 1 hyperparamaters(speaker_specific_emotion_sequence) : GRU n-n
input_size_1  = 7
hidden_size_1 = 10 
num_layers_1  = 2 
output_size_1 = 10


# Module 2 hyperparamaters(utterance_context) : Transformer Enc
input_size_2 = 768
n_head_2     = 4
dm_ff_2      = 2048
dp_2         = 0.2
num_layers_2 = 4 
act_fn_2     = 'relu'

# Module 3 hyperparamaters(speaker_context) : Transformer Enc
input_size_3 = 231
n_head_3     = 3
dm_ff_3      = 2048
dp_3         = 0.2
num_layers_3 = 4 
act_fn_3     = 'relu'

# Module 4 hyperparamaters(global_emotion_sequence) : GRU
input_size_4  = 7
hidden_size_4 = 10 
num_layers_4  = 2 
output_size_4 = 7

# Module 5 hyperparamaters(valence) : Transformer Enc
input_size_5 = 69
n_head_5     = 3
dm_ff_5      = 2048
dp_5         = 0.2
num_layers_5 = 4 
act_fn_5     = 'relu'

# Module 6 hyperparamaters(speaker_specific_valence_sequence) : GRU
input_size_6  = 1
hidden_size_6 = 10
num_layers_6  = 2
output_size_6 = 10

# Final Model Hyperparamerters:
fc1_out = 800
fc2_out = 800
fc3_out = 400
fc4_out = 100
fc5_out = 1

learning_rate = 0.0001

Using device =>  cpu  torch  2.1.0+cu121


In [56]:
with open('train_df.pkl', 'rb') as f:
    train_df = pickle.load(f)

In [57]:
class SemEvalDataset(Dataset):
    def __init__(self, data):
        self.data = data
        self.len = len(self.data)
        print(list(train_df.columns))
        
    def __len__(self):
        return self.len
    
    def __getitem__(self, index):
        dict_x = {}
        dict_x['speaker'] = torch.tensor(self.data['speakers'][index], dtype=torch.float32)
        dict_x['emotion'] = torch.tensor(self.data['emotions'][index], dtype=torch.float32)
        dict_x['sentence_embeddings'] = torch.tensor(self.data['sentence_embeddings'][index], dtype=torch.float32)
        dict_x['valence'] = torch.tensor(self.data['valence'][index], dtype=torch.float32)

        dict_y = {}
        dict_y['triggers'] =  torch.tensor(self.data['triggers'][index], dtype=torch.float32)

        return dict_x, dict_y

In [58]:
dataset = SemEvalDataset(train_df)

['episode', 'speakers', 'emotions', 'utterances', 'triggers', 'sentence_embeddings', 'valence']


In [59]:
from torch.utils.data import random_split
total_size = len(dataset)
train_ratio = 0.8
val_ratio = 0.2

train_size = int(total_size * train_ratio)
val_size = int(total_size * val_ratio)

# Dividir el conjunto de datos
train_data, val_data = random_split(dataset, [train_size, val_size])

In [60]:
class MELDCollate:
    def __init__(self, pad_value = 0):
        self.pad_value = pad_value
    def __call__(self, batch):
        speaker             = pad_sequence([item[0]['speaker'] for item in batch], batch_first = True)
        emotion             = pad_sequence([item[0]['emotion'] for item in batch], batch_first = True)
        sentence_embeddings = pad_sequence([item[0]['sentence_embeddings'] for item in batch], batch_first = True)
        valence             = pad_sequence([item[0]['valence'] for item in batch], batch_first = True)
        # print('\noriginal list : ',[item[0]['speaker'] for item in batch], '\n\npadded list : ', speaker)
        labels              = pad_sequence([item[1]['triggers'] for item in batch], batch_first = True)

        dict_x = { 'speaker': speaker, 'emotion':emotion,  'sentence_embeddings':sentence_embeddings, 'valence':valence}
        dict_y = {'labels': labels}

        return dict_x, dict_y

In [61]:
train_loader  = DataLoader(dataset = train_data, batch_size = 64, shuffle=True, collate_fn= MELDCollate())
val_loader    = DataLoader(dataset = val_data, batch_size = 64, shuffle=True, collate_fn= MELDCollate())

In [62]:
desired_batch_index = 4
for i, batch in enumerate(train_loader):
    if i == desired_batch_index:
        # 'batch' contendrá el batch en el índice especificado
        print(f"Batch {i}:")
        print(batch[1]['labels'].shape)
        break

Batch 4:
torch.Size([64, 22])


In [63]:
tesst = train_data.__getitem__(1)
print(tesst[0]['speaker'].shape, tesst[0]['emotion'].shape, tesst[0]['sentence_embeddings'].shape, tesst[0]['valence'].shape)
# print(tesst[0]['valence'].shape)
# print(tesst[0]['speaker'].size())
# test = tesst[0]['speaker'].unique(dim=0, return_inverse=True)[1]
# [torch.where(test == i, tesst[0]['valence'].mean(1), 0) for i in test.unique()]

torch.Size([7, 231]) torch.Size([7, 7]) torch.Size([7, 768]) torch.Size([7, 69])


In [64]:
class Module6GRU(nn.Module):
    def __init__(self, input_size, num_layers, hidden_size, output_size):
        super(Module6GRU, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.output_size = output_size
        # Since there are maximum of 8 speakers in a dialogue, so we decided to make 8 GRUs one for each speaker.
        self.gru_list= []
        for id in range(MAX_NO_OF_SPEAKERS):
            self.gru_list.append(nn.GRU(input_size, hidden_size, num_layers, batch_first = True))
        self.gru_modules = nn.ModuleList(self.gru_list)

    def valence_specific(self, valence, speaker):
        speaker = speaker.unique(dim = 0, return_inverse=True)[1]
        return [torch.where(speaker == i , valence.mean(1), 0) for i in speaker.unique()]

    def applyGRU(self, speaker_valence, seq_len):
        speaker_output = torch.zeros(seq_len, self.output_size)

        for sp_idx, valence in enumerate(speaker_valence):
            # Verificar si hay alguna entrada para este hablante
            if valence.nonzero().size(0) == 0:
                continue

            # Asegúrate de que valence tenga al menos dos dimensiones
            valence = valence.unsqueeze(1)

            # Inicializar h0 como un tensor 2D
            h0 = torch.zeros(self.num_layers, self.hidden_size)  # Ahora h0 es 2D

            out, _ = self.gru_modules[sp_idx](valence, h0)

            # Rellenar speaker_output con la salida correspondiente
            for uid, output in enumerate(out.squeeze(0)):
                speaker_output[uid] = output

        return speaker_output


    def forward(self, x, speakers):
        batch_size = x.size(0)
        seq_len    = x.size(1)
        outputs = []
        for i in range(batch_size):
            speaker_specific = self.valence_specific(x[i], speakers[i])
            out = self.applyGRU(speaker_specific, seq_len)
            outputs.append(out)
        
        final_output = torch.cat([outputs[i].unsqueeze(2) for i in range(len(outputs))], 2).permute(2,0,1)
        
        return final_output

In [65]:
class Module5TransformerEnc(nn.Module):
    # S, N, E : (seq_len, batch_size, input/embedding_size)
    def __init__(self, input_size, n_head, dim_ff, dp, num_layers, act_fn = 'relu'):
        super(Module5TransformerEnc, self).__init__()
        self.input_size = input_size
        
        self.encoder_layer = nn.TransformerEncoderLayer(d_model = input_size, nhead = n_head, dim_feedforward = dim_ff, dropout=dp, activation=act_fn)
        self.encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)

    def make_src_mask(self, src): # src_shape : (S, N, E)
        pad_value = torch.zeros(self.input_size).to(device)
        # pad_value shape : (E), value : [0,0,0, ...]
        src = src.transpose(0,1)
        # src_shape : (N, S, E)

        src_mask = torch.all(torch.eq(src,pad_value),2)
        
        # src_mask shape : (N, S), value : for each batch, it is contains seq_len sized tensors and contains true for pad and false for others
        return src_mask

    def forward(self, x):
        # x shape: seq_len, batch_size, input_size 
        # Since batch_first is not a parameter in trasformer so the input must be S, N, E
        x_mask = self.make_src_mask(x)
        out = self.encoder(x, src_key_padding_mask = x_mask)  
        # out shape : (S, N, E)
        return out

In [66]:
class Module4GRU(nn.Module):
    def __init__(self, input_size, num_layers, hidden_size, output_size):
        super(Module4GRU, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size

        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first = True)
        self.fc  = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        print('x shape : ', x.shape)
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size)
        out, _ = self.gru(x, h0)
        
        # shape of out :  (N, seq_len, hidden_size)     (torch.Size([10, 33, 8])) 
        # shape of hn  :  (num_layers, N, hidden_size)     (torch.Size([2, 10, 8]))
        # shape of hn  :  (N, num_layers, hidden_size) and then flatten it to (N, num_layers*hiddem_size) 3D to 2D
        output = self.fc(out)
        # shape of output : [N, output_size]

        return output

In [67]:
class Module3TransformerEnc(nn.Module):
    # S, N, E : (seq_len, batch_size, input/embedding_size)
    def __init__(self, input_size, n_head, dim_ff, dp, num_layers, act_fn = 'relu'):
        super(Module3TransformerEnc, self).__init__()
        self.input_size = input_size
        
        self.encoder_layer = nn.TransformerEncoderLayer(d_model = input_size, nhead = n_head, dim_feedforward = dim_ff, dropout=dp, activation=act_fn)
        self.encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)

    def make_src_mask(self, src): # src_shape : (S, N, E)
        pad_value = torch.zeros(self.input_size).to(device)
        # pad_value shape : (E), value : [0,0,0, ...]
        src = src.transpose(0,1)
        # src_shape : (N, S, E)

        src_mask = torch.all(torch.eq(src,pad_value),2)
        
        # src_mask shape : (N, S), value : for each batch, it is contains seq_len sized tensors and contains true for pad and false for others
        return src_mask

    def forward(self, x):
        # x shape: seq_len, batch_size, input_size 
        # Since batch_first is not a parameter in trasformer so the input must be S, N, E
        x_mask = self.make_src_mask(x)
        out = self.encoder(x, src_key_padding_mask = x_mask)  
        # out shape : (S, N, E)
        return out

In [68]:
class Module2TransformerEnc(nn.Module):
    # S, N, E : (seq_len, batch_size, input/embedding_size)
    def __init__(self, input_size, n_head, dim_ff, dp, num_layers, act_fn = 'relu'):
        super(Module2TransformerEnc, self).__init__()
        self.input_size = input_size
        
        self.encoder_layer = nn.TransformerEncoderLayer(d_model = input_size, nhead = n_head, dim_feedforward = dim_ff, dropout=dp, activation=act_fn)
        self.encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)

    def make_src_mask(self, src): # src_shape : (S, N, E)
        pad_value = torch.zeros(self.input_size)
        # pad_value shape : (E), value : [0,0,0, ...]
        src = src.transpose(0,1)
        # src_shape : (N, S, E)

        src_mask = torch.all(torch.eq(src,pad_value),2)
        
        # src_mask shape : (N, S), value : for each batch, it is contains seq_len sized tensors and contains true for pad and false for others
        return src_mask

    def forward(self, x):
        # x shape: seq_len, batch_size, input_size 
        # Since batch_first is not a parameter in trasformer so the input must be S, N, E
        x_mask = self.make_src_mask(x)
        out = self.encoder(x, src_key_padding_mask = x_mask)  
        # out shape : (S, N, E)
        return out

In [69]:
class Module1GRU(nn.Module):
    def __init__(self, input_size, num_layers, hidden_size, output_size):
        super(Module1GRU, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.output_size = output_size
        # Since there are maximum of 8 speakers in a dialogue, so we decided to make 8 GRUs one for each speaker.
        self.gru_list= []
        for id in range(MAX_NO_OF_SPEAKERS):
            self.gru_list.append(nn.GRU(input_size, hidden_size, num_layers, batch_first = True))
        self.gru_modules = nn.ModuleList(self.gru_list)
        # self.fc  = nn.Linear(num_layers*hidden_size, output_size)
            
    
    def segregateEmotions(self, emotions, speakers):
        speaker_specific = []
        utt_id = []
        for i in range(MAX_NO_OF_SPEAKERS):
            speaker_tensor = torch.zeros(MAX_NO_OF_SPEAKERS, dtype = float)
            speaker_tensor[i] = 1
            emo = emotions[torch.nonzero((speakers == speaker_tensor).sum(dim=1) == speakers.size(1))].permute(1,0,2)
            if(emo.size(1) == 0):
                continue
            utt_id.append(torch.nonzero((speakers == speaker_tensor).sum(dim=1) == speakers.size(1))[0])
            speaker_specific.append(emo)
#             print('\n emo size : ',emo.size())
#         print('\n emo concat size : ',speaker_specific, utt_id)
        return speaker_specific, utt_id
    
    def applyGRU(self, speaker_specific, utt_id, seq_len):
        speaker_output = torch.zeros(seq_len, self.output_size)  
        for sp_idx in range(len(utt_id)):
            h0 = torch.zeros(self.num_layers, 1, self.hidden_size).to(device)
            out, hn = self.gru_list[sp_idx](speaker_specific[sp_idx], h0)
            for uid in range(utt_id[sp_idx].size(0)):
                speaker_output[utt_id[sp_idx][uid]] = out[0][uid].clone()
        return speaker_output

    def forward(self, x, speakers):
        batch_size = x.size(0)
        seq_len    = x.size(1)
        outputs = []
        print('x shape : ', x.shape)
        for i in range(batch_size):
            speaker_specific, utt_id = self.segregateEmotions(x[i], speakers[i])
            out = self.applyGRU(speaker_specific, utt_id, seq_len)
            outputs.append(out)
        
        final_output = torch.cat([outputs[i].unsqueeze(2) for i in range(len(outputs))], 2).permute(2,0,1)
        
        return final_output

In [75]:
class FinalModel(L.LightningModule):
        def __init__(self, 
                input_size_1, hidden_size_1, num_layers_1, output_size_1,      # module 1    
                input_size_2, n_head_2, dm_ff_2, dp_2, num_layers_2, act_fn_2, # module 2
                input_size_3, n_head_3, dm_ff_3, dp_3, num_layers_3, act_fn_3, # module 3
                input_size_4, hidden_size_4, num_layers_4, output_size_4,      # module 4
                input_size_5, n_head_5, dm_ff_5, dp_5, num_layers_5, act_fn_5, # module 5
                input_size_6, hidden_size_6, num_layers_6, output_size_6,      # module 6
                fc1_out, fc2_out, fc3_out, fc4_out, fc5_out, dp, #masking = False            # final Model parameters
                ):
                super(FinalModel, self).__init__()

                #self.masking = masking

                self.module1 = Module1GRU(input_size = input_size_1, num_layers = num_layers_1, hidden_size = hidden_size_1, output_size = output_size_1)
                self.module2 = Module2TransformerEnc(input_size = input_size_2, n_head = n_head_2, dim_ff = dm_ff_2, dp = dp_2, num_layers = num_layers_2, act_fn = act_fn_2)
                self.module3 = Module3TransformerEnc(input_size = input_size_3, n_head = n_head_3, dim_ff = dm_ff_3, dp = dp_3, num_layers = num_layers_3, act_fn = act_fn_3)
                self.module4 = Module4GRU(input_size = input_size_4, num_layers = num_layers_4, hidden_size = hidden_size_4, output_size = output_size_4)
                self.module5 = Module5TransformerEnc(input_size = input_size_5, n_head = n_head_5, dim_ff = dm_ff_5, dp = dp_5, num_layers = num_layers_5, act_fn = act_fn_5)
                self.module6 = Module6GRU(input_size = input_size_6, num_layers = num_layers_6, hidden_size = hidden_size_6, output_size = output_size_6)

                
                self.sigmoid = nn.Sigmoid()
                self.fc1 = nn.Linear(input_size_2+input_size_3+input_size_5, fc1_out)
                self.classification = nn.Sequential(
                        nn.Linear(output_size_1 + fc1_out + output_size_4 + output_size_6, fc2_out),
                        nn.ReLU(),
                        nn.Dropout(dp), 
                        nn.Linear(fc2_out, fc3_out),
                        nn.ReLU(),
                        nn.Dropout(dp),
                        nn.Linear(fc3_out, fc4_out),
                        nn.ReLU(),
                        nn.Dropout(dp),
                        nn.Linear(fc4_out, fc5_out),
                        # nn.Sigmoid()
                )

        def forward(self, x):
                speaker = x['speaker']
                emotion = x['emotion']
                sentence_embeddings = x['sentence_embeddings']
                valence = x['valence']

                out1 = self.module1(emotion, speaker)
                out2 = self.module2(sentence_embeddings)
                out3 = self.module3(speaker)
                out4 = self.module4(emotion)
                out5 = self.module5(valence)
                out6 = self.module6(valence, speaker)

                out146 = torch.cat((out1, out4, out6), 2)
                out234 = F.relu(self.fc1(torch.cat((out2, out3, out5), 2)))
                
                out123456 = torch.cat((out146, out234), 2)

                out_tensor = torch.zeros(out123456.size(0), out123456.size(1))

                for batch_idx in range(out123456.size(0)):
                        for seq_idx in range(out123456.size(1)):
                                # Obtén la salida de la red para el punto de datos actual
                                op = self.classification(out123456[batch_idx][seq_idx])
                                op = self.sigmoid(op)

                                # Asumiendo que 'op' es un tensor unidimensional con la salida de la clasificación
                                # Aquí, seleccionamos el primer elemento ya que op debería ser un scalar después de la sigmoid
                                out_tensor[batch_idx, seq_idx] = op  # Ajusta esto según la estructura real de 'op'

                # Retorna el tensor de salida en lugar de la lista
                return out_tensor


        def training_step(self, batch, batch_idx):
                x, y = batch
                y_hat = self(x)
                print('y_hat shape : ', y_hat)
                print('y shape : ', y['labels'])
                loss = F.binary_cross_entropy(y_hat, y['labels'])
                self.log('train_loss', loss)
                return loss
        
        def validation_step(self, batch, batch_idx):
                x, y = batch
                y_hat = self(x)
                loss = F.binary_cross_entropy(y_hat, y['labels'])
                self.log('val_loss', loss)
                return loss
        
        def configure_optimizers(self):
                optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate)
                scheduler = torch.optim.lr_scheduler.StepLR(optimizer, gamma=0.1, step_size=5)
                return {"optimizer": optimizer, "lr_scheduler": scheduler}
        
        def test_step(self, batch, batch_idx):
                x, y = batch
                y_hat = self(x)
                loss = F.binary_cross_entropy(y_hat, y['labels'])
                self.log('test_loss', loss)
                return loss


In [76]:
model = FinalModel(
        input_size_1, hidden_size_1, num_layers_1, output_size_1,      # module 1    
        input_size_2, n_head_2, dm_ff_2, dp_2, num_layers_2, act_fn_2, # module 2
        input_size_3, n_head_3, dm_ff_3, dp_3, num_layers_3, act_fn_3, # module 3
        input_size_4, hidden_size_4, num_layers_4, output_size_4,      # module 4
        input_size_5, n_head_5, dm_ff_5, dp_5, num_layers_5, act_fn_5, # module 5
        input_size_6, hidden_size_6, num_layers_6, output_size_6,      # module 6
        fc1_out, fc2_out, fc3_out, fc4_out, fc5_out, dp=0.2, #masking = False            # final Model parameters
        )

In [83]:
trainer = L.Trainer(max_epochs=EPOCHS)
trainer.fit(model, train_loader, val_loader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs



  | Name           | Type                  | Params
---------------------------------------------------------
0 | module1        | Module1GRU            | 284 K 
1 | module2        | Module2TransformerEnc | 27.6 M
2 | module3        | Module3TransformerEnc | 5.8 M 
3 | module4        | Module4GRU            | 1.3 K 
4 | module5        | Module5TransformerEnc | 1.5 M 
5 | module6        | Module6GRU            | 242 K 
6 | sigmoid        | Sigmoid               | 0     
7 | fc1            | Linear                | 855 K 
8 | classification | Sequential            | 1.0 M 
---------------------------------------------------------
37.3 M    Trainable params
0         Non-trainable params
37.3 M    Total params
149.266   Total estimated model params size (MB)


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]x shape :  torch.Size([64, 23, 7])
x shape :  torch.Size([64, 23, 7])


RuntimeError: all elements of input should be between 0 and 1