In [4]:
# ! pip install sentence-transformers

In [5]:
import torch
from torch import nn
from datasets import load_dataset
from transformers import (
    RobertaTokenizer,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig,
)
# from sklearnex import patch_sklearn
# patch_sklearn()
# from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader

import gc
import json
import pickle
import numpy as np    
import pandas as pd
# from tqdm.notebook import tqdm
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score, classification_report
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))




In [6]:
# PATH="/kaggle/input/"
# OUTPATH='/kaggle/working/'
# PATH="D:\\ghd\\NLP-Assignments\\Assignment4\\data\\"
PATH="data/"
OUTPATH='output/'
BATCH_SIZE=256
MAX_LENGTH=256
MAX_UTTERANCES=10
ROBERTA_LABELS=100
EPOCHS=1
EOS='</s>'
SEP='[SEP]'
# SOS='o'

torch.manual_seed(0)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_id = "roberta-base"

In [7]:


with open(PATH+"train_file.json") as f:
    data = json.load(f)
    count=0
    trigcount=0
    trigs=0
    for row in data:
        speakers=row['speakers']
        emotions=row['emotions']
        utterances=row['utterances']
        
        # print(speakers,emotions,utterances)
        # trim in front
        while len(utterances)>MAX_UTTERANCES:
            utterances.pop(0)
            speakers.pop(0)
            emotions.pop(0)
        
        # print(speakers,emotions,utterances)
        # pad
        while len(utterances)<MAX_UTTERANCES:
            utterances.insert(0,EOS)
            speakers.insert(0,EOS)
            emotions.insert(0,EOS)
        
        # print(speakers[1],emotions[1],utterances[1])
        text=[]
        t=""
        for i in range(10):
            if(utterances[i]==EOS):
                text.append(utterances[i])
                t=EOS
                continue
            if(t==EOS):
                t=""
            text.append(t+SEP+" "+f"{speakers[i]}:{utterances[i]}:{emotions[i]} ")
            t+=f"{speakers[i]}:{utterances[i]}:{emotions[i]} "
        




In [8]:

def get_train(data):
    x=[]
    y=[]
    for row in data:
        speakers=row['speakers']
        emotions=row['emotions']
        utterances=row['utterances']
        triggers=row['triggers']
        # print(speakers,emotions,utterances)
        # trim in front
        while len(utterances)>MAX_UTTERANCES:
            utterances.pop(0)
            speakers.pop(0)
            emotions.pop(0)
            triggers.pop(0)
        
        # print(speakers,emotions,utterances)
        # pad
        while len(utterances)<MAX_UTTERANCES:
            utterances.insert(0,EOS)
            speakers.insert(0,EOS)
            emotions.insert(0,EOS)
            triggers.insert(0,0)
        # print(speakers[1],emotions[1],utterances[1])
        text=[]
        t=""
        for i in range(10):
            if(utterances[i]==EOS):
                text.append(utterances[i])
                t=EOS
                continue
            if(t==EOS):
                t=""
            text.append(t+SEP+" "+f"{speakers[i]}:{utterances[i]}:{emotions[i]} ")
            t+=f"{speakers[i]}:{utterances[i]}:{emotions[i]} "
        x.append(text)
        trigs=[]
        for i in triggers:
            if(i is None):
                trigs.append(0)
            else:
                trigs.append(int(i))
        y.append(trigs)
    
    return x,y

def get_eval(data):
    x=[]
    y=[]
    max_len=max([len(i["triggers"]) for i in data])
    # left pad triggers

    for row in data:
        speakers=row['speakers']
        emotions=row['emotions']
        utterances=row['utterances']
        triggers=row['triggers']
        # trim in front
        while len(triggers)>max_len:
            triggers.pop(0)
        # left pad triggers
        while len(triggers)<max_len:
            triggers.insert(0,0)


        # print(speakers,emotions,utterances)
        # trim in front
        while len(utterances)>MAX_UTTERANCES:
            utterances.pop(0)
            speakers.pop(0)
            emotions.pop(0)
        
        # print(speakers,emotions,utterances)
        # pad
        while len(utterances)<MAX_UTTERANCES:
            utterances.insert(0,EOS)
            speakers.insert(0,EOS)
            emotions.insert(0,EOS)
        # print(speakers[1],emotions[1],utterances[1])
        text=[]
        t=""
        for i in range(10):
            if(utterances[i]==EOS):
                text.append(utterances[i])
                t=EOS
                continue
            if(t==EOS):
                t=""
            text.append(t+SEP+" "+f"{speakers[i]}:{utterances[i]}:{emotions[i]} ")
            t+=f"{speakers[i]}:{utterances[i]}:{emotions[i]} "
        x.append(text)
        trigs=[]
        for i in triggers:
            if(i is None):
                trigs.append(0)
            else:
                trigs.append(int(i))
        y.append(trigs)
    
    return x,y


def load_data(PATH):
    with open(PATH) as f:
        data = json.load(f)
        return data
    
label_encoding = {
    "S":0,
    "surprise":1,
    "fear":2,
    "neutral":3,
    "sadness":4,
    "disgust":5,
    "anger":6,
    "joy":7
}
data = load_data(PATH+"train_file.json")
data_val = load_data(PATH+"val_file.json")
x_train,y_train = get_train(data)
x_val,y_val = get_eval(data_val)

In [9]:
max_len=0
for i in x_train:
    for j in i:
        max_len=max(max_len,len(j.split()))

print(max_len)


169


In [10]:
x_val[0],y_val[0]

(["[SEP] Joey:Y'know what you should do, you should get her one of those um, barium enemas.:neutral ",
  "Joey:Y'know what you should do, you should get her one of those um, barium enemas.:neutral [SEP] Joey:Those are dead serious.:neutral ",
  "Joey:Y'know what you should do, you should get her one of those um, barium enemas.:neutral Joey:Those are dead serious.:neutral [SEP] Chandler:All right. Look, I'm gonna go in here, and you don't buy me anything ever.:anger ",
  "Joey:Y'know what you should do, you should get her one of those um, barium enemas.:neutral Joey:Those are dead serious.:neutral Chandler:All right. Look, I'm gonna go in here, and you don't buy me anything ever.:anger [SEP] Joey:No, no, you can't, you can't, okay, you can't, you can't buy her pearls, you just can't, you can't, you can't.:fear ",
  "Joey:Y'know what you should do, you should get her one of those um, barium enemas.:neutral Joey:Those are dead serious.:neutral Chandler:All right. Look, I'm gonna go in her

In [11]:
np.array(x_train).shape

(6740, 10)

In [12]:
len(x_train),len(y_train),len(x_val),len(y_val)

(6740, 6740, 843, 843)

In [13]:
tokenizer = RobertaTokenizer.from_pretrained(model_id)

In [14]:
# class

In [15]:
# len(x_train)
# len(x_train[0])
# x_train = np.array(x_train)
# x_train
y_train[0],y_train[4]

([0, 0, 0, 0, 0, 1, 1, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 1])

In [16]:
class Dataset(Dataset):
    def __init__(self, data, tokenizer,labels):
        self.data=data
        self.tokenizer = tokenizer
        self.labels = labels
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self,idx):
#         print(self.data[idx])
#         dat = "".join(self.data[idx])
        x_tokenized = self.tokenizer(self.data[idx],max_length=MAX_LENGTH,truncation=True,return_tensors="pt",padding="max_length")
        input_ids = x_tokenized["input_ids"].squeeze(0)
        attention_mask = x_tokenized["attention_mask"].squeeze(0)
        return input_ids,attention_mask,torch.tensor(self.labels[idx]).squeeze(0)

In [17]:
train_dataset = Dataset(x_train,tokenizer,y_train)
val_dataset = Dataset(x_val,tokenizer,y_val)

In [18]:
train_dataset[0]

(tensor([[    0,     2,     2,  ...,     1,     1,     1],
         [    0,     2,     2,  ...,     1,     1,     1],
         [    0,     2,     2,  ...,     1,     1,     1],
         ...,
         [    0, 17297,  3540,  ...,     1,     1,     1],
         [    0, 17297,  3540,  ...,     1,     1,     1],
         [    0, 17297,  3540,  ...,     1,     1,     1]]),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 tensor([0, 0, 0, 0, 0, 1, 1, 0, 0, 0]))

In [19]:
from torch.utils.data import DataLoader
train_dataloader=DataLoader(train_dataset,batch_size=BATCH_SIZE,shuffle=False)
val_dataloader=DataLoader(val_dataset,batch_size=BATCH_SIZE,shuffle=False)

Roberta output should be of size BATCH_SIZExlabels, we need linear layer to output BATCH_SIZExlabels as emotions in one way or another after we will do softmax. Now manage this. 

In [20]:
# class my_model(nn.Module):
#     def __init__(self):
#         super(my_model,self).__init__()
# #         self.lstm_model = torch.nn.
# #         self.roberta_model = roberta_model
# #         self.num_labels = num_labels
#         self.LSTM = nn.LSTM(input_size=128,hidden_size=2, batch_first=True)
#         self.linear = nn.Linear(2,8)
#         self.softmax = nn.Softmax()
#     def forward(self,input_ids):
# #         self.lstm = torch.nn
#         lstm_output,_    =self.LSTM(input_ids)
# #         print(lstm_output)
# #         linear_output = self.linear(lstm_output)
#         linear_outputs = []
#         for i in range(lstm_output.size(1)):
#             linear_output = self.linear(lstm_output[:,i,:])
#             softmax_output = self.softmax(linear_output)
#             softmax_outputs.append(softmax_output.unsqueeze(1))
        
        
        
#         # Concatenate the list of linear outputs along the time step dimension
# #         linear_outputs_tensor = torch.cat(linear_outputs, dim=1)
#         return torch.tensor(softmax_outputs,device=device)

In [21]:
class MyModel(nn.Module):
    def __init__(self, input_size=MAX_LENGTH, hidden_size=MAX_UTTERANCES, num_linear_layers=MAX_UTTERANCES, linear_size=2):
        super(MyModel, self).__init__()
        
  
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        
    
        self.linear_layers = nn.ModuleList([nn.Linear(hidden_size, linear_size) for _ in range(num_linear_layers)])

        self.softmax_layers = nn.ModuleList([nn.Softmax(dim=1) for _ in range(num_linear_layers)])
    def forward(self, x):
 
        lstm_out, _ = self.lstm(x)
#         print("Output shape of lstm = ",lstm_out.shape)

        softmax_outputs = []
        for i in range(len(self.linear_layers)):
            linear_output = self.linear_layers[i](lstm_out[:, -1, :])

            softmax_output = self.softmax_layers[i](linear_output)
            softmax_outputs.append(softmax_output.float())
           
        stacked_tensor = torch.stack(softmax_outputs, dim=1)
        
        return stacked_tensor.clone().detach().requires_grad_(True)


In [22]:
model = MyModel()
model.to(device)


MyModel(
  (lstm): LSTM(256, 10, batch_first=True)
  (linear_layers): ModuleList(
    (0-9): 10 x Linear(in_features=10, out_features=2, bias=True)
  )
  (softmax_layers): ModuleList(
    (0-9): 10 x Softmax(dim=1)
  )
)

In [23]:
import torch.nn as nn

criterion = nn.functional.cross_entropy

def metrics(y_true, y_pred):
    acc=accuracy_score(y_true, y_pred)
    f1_macro=f1_score(y_true, y_pred, average='macro')  
    f1_scores=f1_score(y_true, y_pred, average="weighted")
    return acc, f1_macro, f1_scores


In [29]:
def train_epoch(model, optimizer,epoch):
    model.train()
    losses = 0
    for batch in tqdm(train_dataloader, desc=f"Epoch:{epoch}",total=len(train_dataloader), leave=False):

        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        batch_size = input_ids.size(0)
        utt_size = input_ids.size(1)

        outputs  = model(input_ids.float())

        outputs=outputs.view(batch_size*utt_size,-1)
        labels=labels.view(batch_size*utt_size)
        
        # print(outputs.shape,labels.shape)
        print(outputs)
        loss = criterion(outputs, labels)

        loss = loss.to(device)
#         print("Loss = ",loss.item)
        losses+=loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        for name, param in model.named_parameters():
            if param.grad is not None:
                print(f'Gradient - {name}: {param.grad.norm()}')

        del input_ids
        del attention_mask
        del labels
        del outputs

        gc.collect()
        torch.cuda.empty_cache()
        # break
#     print("BATCH FINISHED")
    x = losses /len(train_dataloader)
    tqdm.write(f"Epoch:{epoch}, Avg Train Loss: {x}")
    gc.collect()
    torch.cuda.empty_cache()
    return x




def evaluate(model,val_dataloader,name,metric):

    with torch.no_grad():
        model.eval()
        losses = 0

        preds = []
        actuals = []
        for batch in tqdm(val_dataloader, desc=name,total=len(val_dataloader), leave=False):

                input_ids = batch[0].to(device)
                attention_mask = batch[1].to(device)
                labels = batch[2].to(device)

                batch_size = input_ids.size(0)
                utt_size = labels.size(1)

                outputs  = model(input_ids.float())

                size_diff = labels.size(1) - outputs.size(1)

                paddings=torch.zeros(batch_size,size_diff,2).to(device)


                # print(outputs.shape,paddings.shape)
                predicted = torch.cat((outputs,paddings),dim=1)

                predicted=predicted.view(batch_size*utt_size,-1)
                labels=labels.view(batch_size*utt_size)
                
                # print(outputs.shape,labels.shape)
                loss = criterion(predicted, labels)

                loss = loss.to(device)
        #         print("Loss = ",loss.item)
                losses+=loss.item()
    
                _, pred = torch.max(predicted, 1)

                # Flatten the predictions and targets
                predicted_flat = pred.view(-1)
                targets_flat = labels.view(-1)

                preds.extend(predicted_flat.cpu().numpy())
                actuals.extend(targets_flat.cpu().numpy())

                del input_ids
                del attention_mask
                del labels
                del predicted
                # del predicted_labels
                gc.collect()
                torch.cuda.empty_cache()
            # break
        # print(losses,len(val_dataloader))
        x = losses / len(val_dataloader)
        tqdm.write(f"Avg {name} Loss: {x}")
        gc.collect()
        torch.cuda.empty_cache()
        acc,macro,f1 = metric(actuals,preds)
        tqdm.write(f"Avg {name} Accuracy: {acc}, F1 Macro: {macro}, F1 Scores: {f1}")
        return x,preds

In [30]:
optim = torch.optim.AdamW(model.parameters(),lr=10)

In [31]:
for name, param in model.named_parameters():
    print(f'Parameter - {name}: {param.shape}, grad = {param.requires_grad}')

Parameter - lstm.weight_ih_l0: torch.Size([40, 256]), grad = True
Parameter - lstm.weight_hh_l0: torch.Size([40, 10]), grad = True
Parameter - lstm.bias_ih_l0: torch.Size([40]), grad = True
Parameter - lstm.bias_hh_l0: torch.Size([40]), grad = True
Parameter - linear_layers.0.weight: torch.Size([2, 10]), grad = True
Parameter - linear_layers.0.bias: torch.Size([2]), grad = True
Parameter - linear_layers.1.weight: torch.Size([2, 10]), grad = True
Parameter - linear_layers.1.bias: torch.Size([2]), grad = True
Parameter - linear_layers.2.weight: torch.Size([2, 10]), grad = True
Parameter - linear_layers.2.bias: torch.Size([2]), grad = True
Parameter - linear_layers.3.weight: torch.Size([2, 10]), grad = True
Parameter - linear_layers.3.bias: torch.Size([2]), grad = True
Parameter - linear_layers.4.weight: torch.Size([2, 10]), grad = True
Parameter - linear_layers.4.bias: torch.Size([2]), grad = True
Parameter - linear_layers.5.weight: torch.Size([2, 10]), grad = True
Parameter - linear_lay

In [32]:
for i in range(1,EPOCHS+1):
    train_epoch(model,optim,i)
    evaluate(model,val_dataloader,"Validation",metrics)

Epoch:1:   0%|          | 0/27 [00:00<?, ?it/s]

tensor([[0.3952, 0.6048],
        [0.5759, 0.4241],
        [0.6257, 0.3743],
        ...,
        [0.3173, 0.6827],
        [0.7355, 0.2645],
        [0.5712, 0.4288]], device='cuda:0', grad_fn=<ViewBackward0>)


Epoch:1:   4%|▎         | 1/27 [00:01<00:29,  1.12s/it]

tensor([[0.3671, 0.6329],
        [0.5233, 0.4767],
        [0.4960, 0.5040],
        ...,
        [0.4822, 0.5178],
        [0.5836, 0.4164],
        [0.6419, 0.3581]], device='cuda:0', grad_fn=<ViewBackward0>)


Epoch:1:   7%|▋         | 2/27 [00:01<00:24,  1.03it/s]

tensor([[0.6209, 0.3791],
        [0.4539, 0.5461],
        [0.5439, 0.4561],
        ...,
        [0.4481, 0.5519],
        [0.6921, 0.3079],
        [0.5744, 0.4256]], device='cuda:0', grad_fn=<ViewBackward0>)


Epoch:1:  11%|█         | 3/27 [00:02<00:22,  1.08it/s]

tensor([[0.5859, 0.4141],
        [0.4671, 0.5329],
        [0.6469, 0.3531],
        ...,
        [0.3695, 0.6305],
        [0.6404, 0.3596],
        [0.4753, 0.5247]], device='cuda:0', grad_fn=<ViewBackward0>)


Epoch:1:  15%|█▍        | 4/27 [00:03<00:20,  1.12it/s]

tensor([[0.3612, 0.6388],
        [0.6786, 0.3214],
        [0.6260, 0.3740],
        ...,
        [0.4936, 0.5064],
        [0.6797, 0.3203],
        [0.4736, 0.5264]], device='cuda:0', grad_fn=<ViewBackward0>)


Epoch:1:  19%|█▊        | 5/27 [00:04<00:19,  1.11it/s]

tensor([[0.2963, 0.7037],
        [0.5632, 0.4368],
        [0.5479, 0.4521],
        ...,
        [0.2949, 0.7051],
        [0.6173, 0.3827],
        [0.4892, 0.5108]], device='cuda:0', grad_fn=<ViewBackward0>)


Epoch:1:  22%|██▏       | 6/27 [00:05<00:18,  1.12it/s]

tensor([[0.3655, 0.6345],
        [0.5155, 0.4845],
        [0.6113, 0.3887],
        ...,
        [0.5216, 0.4784],
        [0.7749, 0.2251],
        [0.5869, 0.4131]], device='cuda:0', grad_fn=<ViewBackward0>)


Epoch:1:  26%|██▌       | 7/27 [00:06<00:18,  1.11it/s]

tensor([[0.2622, 0.7378],
        [0.4570, 0.5430],
        [0.4555, 0.5445],
        ...,
        [0.3589, 0.6411],
        [0.7133, 0.2867],
        [0.4565, 0.5435]], device='cuda:0', grad_fn=<ViewBackward0>)


Epoch:1:  30%|██▉       | 8/27 [00:07<00:17,  1.11it/s]

: 

In [None]:
# gc.collect()
# torch.cuda.empty_cache()
# for epoch in range(1, EPOCHS+1):
#     train_loss = train_epoch(model, optimizer,epoch)
#     val_loss,all_labels = evaluate(model,val_dataloader=val_dataloader,name='Val')
#     if(epoch%2==0):
#         torch.save(model, f"{OUTPATH}modelM1_epoch{epoch}.pth")
#     print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}"))

In [None]:
torch.save(model, f"{OUTPATH}modelM3.pth")
torch.save(tokenizer, f"{OUTPATH}tokenizerM3.pth")

In [None]:
# load model
loaded_model = torch.load(f"{OUTPATH}modelM3.pth")
loaded_tokenizer = torch.load(f"{OUTPATH}tokenizerM3.pth")

# test
x_test, y_test = get_eval(load_data(PATH+"val_file.json"))
test_dataset = Dataset(x_test,loaded_tokenizer,y_test)
test_dataloader=DataLoader(test_dataset,batch_size=BATCH_SIZE,shuffle=False)
loss,preds=evaluate(loaded_model,test_dataloader,"Test",metrics)


                                                   

Avg Test Loss: 0.6939282864332199
Avg Test Accuracy: 0.7346433544793439, F1 Macro: 0.4238919486812682, F1 Scores: 0.7986921870648321


In [None]:
y_test_list = np.array(y_test).flatten()

preds=np.array(preds).flatten()

In [None]:
dic1={}
dic2={}
for i in range(len(y_test_list)):
    if y_test_list[i] in dic1:
        dic1[y_test_list[i]]+=1
    else:
        dic1[y_test_list[i]]=1
    if preds[i] in dic2:
        dic2[preds[i]]+=1
    else:
        dic2[preds[i]]=1

dic1,dic2

({0: 18282, 1: 1107}, {1: 4042, 0: 15347})

In [None]:
print(classification_report(y_test_list, preds))
f1_scores = f1_score(y_test_list,preds, average=None)

# Print F1 score for each label
for label, f1 in enumerate(f1_scores):
    print(f"F1 score for label {label}: {f1}")

              precision    recall  f1-score   support

           0       0.93      0.78      0.85     18282
           1       0.00      0.00      0.00      1107

    accuracy                           0.73     19389
   macro avg       0.46      0.39      0.42     19389
weighted avg       0.88      0.73      0.80     19389

F1 score for label 0: 0.8470070474887745
F1 score for label 1: 0.0007768498737618957
