In [87]:
import pandas as pd
import os
import numpy as np
import torch

# CPU device handle. In the cells shown here it is only referenced by the
# commented-out random_split generator further down.
device = torch.device('cpu')

# Relative path, so the CSV is resolved against the current working directory.
masac_data_filepath = os.path.join('Data','MaSaC', 'MaSaC_train_efr.csv')
masac_data = pd.read_csv(masac_data_filepath)

In [12]:
masac_data.head(20)

Unnamed: 0,Dialogue_Id,Speaker,Emotion_name,Utterance,Annotate(0/1)
0,0.0,monisha,neutral,theek se wajan karana,0.0
1,0.0,sahil,neutral,monisha,0.0
2,0.0,monisha,anger,"abhi ayi. yeh, ye bhi rakho.",0.0
3,0.0,sahil,neutral,"monisha, iss churidaar me nara nhi hai.",1.0
4,0.0,monisha,neutral,to kisee aur churidaar ka nara nikalke dalo na...,0.0
5,,,,,
6,0.0,monisha,neutral,theek se wajan karana,0.0
7,0.0,sahil,neutral,monisha,0.0
8,0.0,monisha,anger,"abhi ayi. yeh, ye bhi rakho.",0.0
9,0.0,sahil,neutral,"monisha, iss churidaar me nara nhi hai.",0.0


In [13]:
# Rows with a NaN Dialogue_Id act as separators between instances (see row 5
# of the preview above). The running count of separators seen so far, with the
# separator row itself included, is the 0-based flip id of every row.
mask = masac_data["Dialogue_Id"].values

flips = np.cumsum(np.isnan(mask)).tolist()
flip_id = flips[-1] if flips else 0  # total number of separator rows

masac_data['flip'] = flips

masac_data.head(20)

Unnamed: 0,Dialogue_Id,Speaker,Emotion_name,Utterance,Annotate(0/1),flip
0,0.0,monisha,neutral,theek se wajan karana,0.0,0
1,0.0,sahil,neutral,monisha,0.0,0
2,0.0,monisha,anger,"abhi ayi. yeh, ye bhi rakho.",0.0,0
3,0.0,sahil,neutral,"monisha, iss churidaar me nara nhi hai.",1.0,0
4,0.0,monisha,neutral,to kisee aur churidaar ka nara nikalke dalo na...,0.0,0
5,,,,,,1
6,0.0,monisha,neutral,theek se wajan karana,0.0,1
7,0.0,sahil,neutral,monisha,0.0,1
8,0.0,monisha,anger,"abhi ayi. yeh, ye bhi rakho.",0.0,1
9,0.0,sahil,neutral,"monisha, iss churidaar me nara nhi hai.",0.0,1


In [14]:
import pickle

# Lookup table used by later cells as sent2emb[<preprocessed utterance>];
# the values are stacked with torch.stack below, so they are tensors.
# SECURITY: pickle.load can execute arbitrary code while deserialising.
# Only load this file if it comes from a trusted source.
with open('./Data/MaSaC/sent2emb.pickle', 'rb') as f:
    sent2emb = pickle.load(f)

In [32]:
import torch
from utilities import preprocess_text
# Scratch check on a single flip instance (flip id 0): select its rows, read
# the annotation column, and look up one embedding per utterance.
index = 0
this_index_data = masac_data[masac_data['flip'] == index]
# Values of the 'Annotate(0/1)' column for the selected rows, as a tensor.
triggers = torch.tensor(this_index_data['Annotate(0/1)'].values)
# sent2emb is indexed with the preprocessed text, not the raw utterance.
all_utt = this_index_data['Utterance'].apply(preprocess_text).values

# Stack the per-utterance embeddings and display the resulting shape.
torch.stack([sent2emb[utt] for utt in all_utt]).shape

torch.Size([5, 768])

In [76]:
# Drop every row that has a missing value in any column; this removes the
# NaN separator rows.
# NOTE: this rebinds masac_data. The 'flip' ids are derived from those NaN
# rows, so the flip cell must run before this one and cannot be re-run
# afterwards without reloading the CSV.
masac_data = masac_data.dropna()

In [79]:
# [type(x) for x in masac_data['Utterance'].values]
# masac_data['Utterance'].values[5]

In [80]:
from torch.utils.data import Dataset, IterableDataset
import torch
from utilities import preprocess_text

class MaSaCFlipDataset(Dataset):
    """Map-style dataset that serves one flip instance per index.

    A flip instance is the set of rows of ``masac_data`` that share one value
    of the ``'flip'`` column. For every utterance in the instance the item
    holds the utterance embedding concatenated with the embedding of the
    instance's last utterance, paired with that row's ``'Annotate(0/1)'``
    value.

    Args:
        masac_data: Frame with at least the columns ``'flip'``,
            ``'Utterance'`` and ``'Annotate(0/1)'``.
        embeddings: Optional mapping from preprocessed utterance text to its
            embedding. When omitted, the notebook-level ``sent2emb`` is used.
        preprocess: Optional callable applied to each raw utterance before the
            embedding lookup. When omitted, the notebook-level
            ``preprocess_text`` is used.
    """

    def __init__(self, masac_data: pd.DataFrame, *, embeddings=None, preprocess=None):
        self.data = masac_data
        self.embeddings = embeddings
        self.preprocess = preprocess
        # Distinct flip ids in ascending order. Positions in this list are the
        # dataset indices, so gaps in the ids (or ids not starting at 0) are
        # handled and the last instance is not lost.
        self.flip_ids = sorted(masac_data['flip'].unique().tolist())

    def __getitem__(self, index):
        """Return ``[[features, trigger], ...]`` for the flip instance at ``index``.

        ``features`` is ``cat(utterance_embedding, last_utterance_embedding)``
        and ``trigger`` is the 0-d tensor holding that row's annotation.

        Raises:
            IndexError: If ``index`` is outside ``range(-len(self), len(self))``.
            KeyError: If a preprocessed utterance has no embedding.
        """
        # List indexing raises IndexError when out of range, which also lets a
        # plain ``for item in dataset`` loop terminate cleanly.
        flip_id = self.flip_ids[index]
        this_index_data = self.data[self.data['flip'] == flip_id]
        triggers = torch.tensor(this_index_data['Annotate(0/1)'].values)

        # Resolve the notebook globals lazily so injected values take priority.
        lookup = sent2emb if self.embeddings is None else self.embeddings
        clean = preprocess_text if self.preprocess is None else self.preprocess

        all_utt = [clean(utt) for utt in this_index_data['Utterance']]
        # as_tensor is a no-op for tensors, avoiding the copy-construct
        # warning that torch.tensor(<tensor>) emits.
        emb_all_utt = [torch.as_tensor(lookup[utt]) for utt in all_utt]
        emb_last_utt = emb_all_utt[-1]
        return [[torch.cat([e, emb_last_utt]), trigger] for e, trigger in zip(emb_all_utt, triggers)]

    def __len__(self):
        """Number of distinct flip instances."""
        return len(self.flip_ids)
        
        
# masac_data must already carry the 'flip' column, which the dataset reads
# when items are looked up.
dataset = MaSaCFlipDataset(masac_data=masac_data)

In [102]:
from torch.utils.data import random_split
def split_train_val(dataset, val_ratio=0.2):
    """Split an indexable dataset into leading train items and trailing val items.

    The split is sequential (no shuffling): the last
    ``int(len(dataset) * val_ratio)`` items form the validation set and
    everything before them forms the training set.

    Args:
        dataset: Any object supporting ``len()`` and integer indexing.
        val_ratio: Fraction of the items reserved for validation, in [0, 1].

    Returns:
        ``(train_items, val_items)``, two lists of materialised items.

    Raises:
        ValueError: If ``val_ratio`` is outside [0, 1].
    """
    if not 0.0 <= val_ratio <= 1.0:
        raise ValueError(f"val_ratio must be within [0, 1], got {val_ratio}")
    n_dataset = len(dataset)
    # val_ratio sizes the validation part, so the boundary sits at the end.
    split_index = n_dataset - int(n_dataset * val_ratio)
    train_items = [dataset[i] for i in range(split_index)]
    val_items = [dataset[i] for i in range(split_index, n_dataset)]
    return train_items, val_items

# split_train_val indexes every item up front, so both results are plain
# Python lists of materialised items rather than Dataset objects.
train_dataset, val_dataset = split_train_val(dataset, val_ratio=0.2)
# generator1 = torch.Generator(device=device).manual_seed(42)
# train_dataset, val_dataset = random_split(dataset, [0.8, 0.2], generator=generator1)

  return func(*args, **kwargs)


In [45]:
from torch import nn
import torch.nn.functional as F
class SimpleMLPBaseline(nn.Module):
    """Feed-forward baseline: ReLU hidden layers followed by a sigmoid output layer.

    Every layer is an ``nn.LazyLinear``, so input widths are inferred on the
    first forward pass.

    Args:
        hidden_dims: Output width of each linear layer, in order. All but the
            last are followed by ReLU; the last is followed by a sigmoid.

    Raises:
        ValueError: If ``hidden_dims`` is empty.
    """

    def __init__(self, hidden_dims: list):
        # Must run first: it creates the registries that hold submodules.
        super().__init__()
        if len(hidden_dims) == 0:
            raise ValueError("hidden_dims must contain at least one layer width")
        self.hidden_dims = list(hidden_dims)
        # ModuleList (not a plain list) so the layers are registered and show
        # up in parameters(), state_dict() and .to(device).
        self.hiddens = nn.ModuleList([nn.LazyLinear(h) for h in self.hidden_dims])

    def forward(self, dataset):
        """Run ``dataset`` through every layer in order and return sigmoid outputs."""
        output = dataset
        for layer in self.hiddens[:-1]:
            output = F.relu(layer(output))
        # The output layer consumes the hidden stack's result, not the raw input.
        return torch.sigmoid(self.hiddens[-1](output))

# One LazyLinear per entry: layer output widths 1024, 768 and 1.
model = SimpleMLPBaseline([1024, 768, 1])



In [None]:
from torch import optim
from torch.utils.tensorboard import SummaryWriter

# Module-level TensorBoard writer; train() below logs its scalars through this global.
writer = SummaryWriter(log_dir='./tensorboard_logs/efr_mlp_baseline_exp_1')
def train_batch(model, dataset_batch, loss_fn, optimizer):
    """Run one optimisation step on a single batch and return its loss.

    Args:
        model: Callable module producing predictions from the batch inputs.
        dataset_batch: Indexable pair; element 0 is fed to the model and
            element 1 is passed to ``loss_fn`` as the target.
        loss_fn: Callable ``loss_fn(predictions, targets)`` returning a scalar tensor.
        optimizer: Optimizer holding the model's parameters.

    Returns:
        The loss tensor computed before the parameter update.
    """
    # Clear gradients left over from the previous step before the new backward pass.
    optimizer.zero_grad()
    predictions = model(dataset_batch[0])
    targets = dataset_batch[1]
    batch_loss = loss_fn(predictions, targets)
    batch_loss.backward()
    optimizer.step()
    return batch_loss

def val_batch(model, dataset_batch, loss_fn, optimizer=None):
    """Compute the loss on a single batch without tracking gradients.

    Args:
        model: Callable module producing predictions from the batch inputs.
        dataset_batch: Indexable pair; element 0 is fed to the model and
            element 1 is passed to ``loss_fn`` as the target.
        loss_fn: Callable ``loss_fn(predictions, targets)`` returning a scalar tensor.
        optimizer: Unused. Kept (now optional) so the signature stays
            parallel to ``train_batch`` and existing callers keep working.

    Returns:
        The loss tensor, computed under ``torch.inference_mode()``.
    """
    with torch.inference_mode():
        probs = model(dataset_batch[0])
        return loss_fn(probs, dataset_batch[1])

def _collate_flip(batch):
    """Turn one flip instance into an ``(inputs, targets)`` pair of tensors.

    A flip instance is a sequence of ``[features, trigger]`` pairs. Features
    are stacked into a 2-D float tensor and triggers into a float column
    vector. A batch whose first element is already a tensor is assumed to be
    an ``(inputs, targets)`` pair and is returned unchanged.
    """
    if isinstance(batch[0], torch.Tensor):
        return batch
    inputs = torch.stack([torch.as_tensor(features) for features, _ in batch]).float()
    targets = torch.stack([torch.as_tensor(trigger) for _, trigger in batch]).float().reshape(-1, 1)
    return inputs, targets


def train(model, train_dataset, val_dataset, epochs=10, loss_fn=None):
    """Train ``model`` and log train / validation loss to the global ``writer``.

    Args:
        model: Module mapping a ``(n, d)`` float tensor to ``(n, 1)``
            probabilities (e.g. a sigmoid output layer).
        train_dataset: Iterable of training batches, each either a flip
            instance (sequence of ``[features, trigger]`` pairs) or an
            ``(inputs, targets)`` tensor pair.
        val_dataset: Iterable of validation batches in the same format.
        epochs: Number of passes over ``train_dataset``.
        loss_fn: Loss callable. Defaults to ``nn.BCELoss()``, which matches
            probability outputs with 0/1 float targets; ``nn.NLLLoss``
            expects log-probabilities and integer class indices instead.

    NOTE(review): if ``model`` contains lazy layers (e.g. ``nn.LazyLinear``),
    PyTorch recommends one forward pass before building an optimizer. Confirm
    the model is initialised before calling this function.
    """
    if loss_fn is None:
        loss_fn = nn.BCELoss()
    optimizer = optim.AdamW(model.parameters(), lr=1e-3, betas=(0.9, 0.98), weight_decay=1e-9)
    # Monotonic step counter so train-loss points from different epochs do
    # not overwrite each other in TensorBoard.
    global_step = 0
    for epoch in range(epochs):
        for batch in train_dataset:
            train_loss = train_batch(model=model, dataset_batch=_collate_flip(batch), loss_fn=loss_fn, optimizer=optimizer)
            writer.add_scalar('Train Loss', train_loss.item(), global_step)
            global_step += 1
        # Fresh list every epoch so the logged mean covers this epoch only.
        val_losses = []
        for batch in val_dataset:
            val_loss = val_batch(model=model, dataset_batch=_collate_flip(batch), loss_fn=loss_fn, optimizer=optimizer)
            val_losses.append(val_loss.item())
        val_loss = float(np.mean(val_losses)) if val_losses else float('nan')
        if val_losses:
            writer.add_scalar('Val Loss', val_loss, epoch)
        
        
        
        