In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import RobertaTokenizer, RobertaModel, RobertaConfig
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import f1_score
from tqdm import tqdm, trange

In [20]:
# Number of emotion classes: sizes the one-hot targets built by the dataset
# and the output layer of each classifier.
num_classes = 7

In [21]:
class Dataset(torch.utils.data.Dataset):
    """Per-utterance emotion-classification dataset built from dialogue rows.

    Every utterance of every dialogue becomes one sample: the whole dialogue
    is rendered as text with the target utterance fenced by '</s></s>'
    markers, paired with a one-hot emotion vector.

    The base class is referenced as ``torch.utils.data.Dataset`` on purpose:
    this class reuses the name ``Dataset``, so inheriting from the bare name
    would make the class subclass *itself* whenever the cell is re-run.

    Args:
        data: DataFrame whose rows hold parallel lists under the columns
            'speakers', 'utterances' and 'emotions'.
        tokenizer: callable invoked as
            ``tokenizer(text, return_tensors='pt', padding='max_length',
            max_length=..., truncation=True)`` returning a mapping with
            'input_ids' and 'attention_mask' tensors.
        max_len: fixed length every sample is padded / truncated to.
    """

    def __init__(self, data, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = self.process(data, tokenizer)
        self.max_len = max_len

    def __len__(self):
        """Return the number of (context, target-utterance) samples."""
        return len(self.data['input'])

    def __getitem__(self, index):
        """Tokenize sample ``index`` and return flat tensors plus its target."""
        text = self.data['input'][index]
        target = self.data['target'][index]
        encoding = self.tokenizer(
            text,
            return_tensors='pt',
            padding='max_length',
            max_length=self.max_len,
            truncation=True,
        )
        return {
            # flatten() drops the leading batch axis added by return_tensors='pt'
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'target': torch.tensor(target),
        }

    def process(self, data, tokenizer):
        """Expand dialogue rows into per-utterance input strings and targets.

        Relies on the module-level ``num_classes`` and ``emotion_to_idx``.
        ``tokenizer`` is accepted for signature compatibility but not used.

        Returns:
            dict with two parallel lists: 'input' (str) and 'target'
            (one-hot list of length ``num_classes``).
        """
        new_data = {'input': [], 'target': []}
        for i in range(len(data)):
            # Positional access: works whatever the frame's index labels are
            # (label-based .loc[i] breaks once the index is not 0..n-1).
            row = data.iloc[i]
            speakers = row['speakers']
            utterances = row['utterances']
            stringlist = [
                speakers[j] + ': ' + utterances[j] for j in range(len(utterances))
            ]
            for j in range(len(utterances)):
                # earlier context </s></s> target utterance </s></s> later context
                # NOTE(review): the tokenizer is called with default settings, so it
                # may add its own special tokens around this literal '<s> ... </s>'
                # wrapper — confirm this matches how the checkpoints were trained.
                temp = ' '.join(stringlist[:j])
                temp += '</s></s>'
                temp += stringlist[j]
                temp += '</s></s>'
                temp += ' '.join(stringlist[j+1:])
                new_data['input'].append('<s> ' + temp + '</s>')
                target = [0]*num_classes
                target[emotion_to_idx[row['emotions'][j]]] = 1.0
                new_data['target'].append(target)
        return new_data

In [22]:
def collate_fn(batch):
    """Stack per-sample tensors into batch tensors.

    Each element of ``batch`` is a dict with 'input_ids', 'attention_mask'
    and 'target' tensors. The returned dict renames the last two fields to
    'attention_masks' and 'labels'; every value gains a leading batch axis.
    """
    def _stack(field):
        # Gather one field across all samples and add the batch dimension.
        return torch.stack([sample[field] for sample in batch], dim=0)

    return {
        'input_ids': _stack('input_ids'),
        'attention_masks': _stack('attention_mask'),
        'labels': _stack('target'),
    }

In [23]:
# Load the evaluation dialogues from the JSON file (not CSV) into a DataFrame.
# NOTE(review): absolute Kaggle input path — adjust when running elsewhere.
test_data = pd.read_json('/kaggle/input/nlp-a4infer/val_file.json')

In [24]:
# Collapse each per-dialogue list into a single '@'-delimited string so the
# columns become hashable and can be compared by drop_duplicates.
# NOTE(review): an utterance or speaker name that itself contains '@' will not
# survive the later split('@') intact — confirm the data never contains it.
test_data['utterances'] = test_data['utterances'].apply('@'.join)
test_data['speakers'] = test_data['speakers'].apply('@'.join)

In [25]:
# Remove repeated dialogues: rows with the same (speakers, utterances) pair
# are considered duplicates and only the first occurrence is kept.
test_data = test_data.drop_duplicates(subset=['speakers', 'utterances'], keep='first')

In [26]:
# Turn the '@'-delimited strings back into per-dialogue lists.
test_data['utterances'] = test_data['utterances'].apply(str.split, args=('@',))
test_data['speakers'] = test_data['speakers'].apply(str.split, args=('@',))

In [27]:
# Renumber the rows 0..n-1 after de-duplication removed some labels;
# drop=True discards the old index instead of keeping it as a column.
test_data = test_data.reset_index(drop=True)

In [28]:
# Replacement table applied to every utterance: each key character is
# substituted with the plain-ASCII text on the right.
unicode_mapping = {
    # C1 control-range code points (0x85-0x97)
    '\x85': '...',
    '\x91': "'",
    '\x92': "'",
    '\x93': '"',
    '\x94': '"',
    '\x97': '--',
    # Unicode punctuation
    '\u2014': '--',
    '\u2019': "'",
    '\u2026': '...',
    # accented letter
    '\xe9': 'e',
}

In [29]:
# replacing unicode characters in the data
def clean_utterance(utterance_list, mapping=None):
    '''
    Return a new list of utterances with unwanted characters replaced.

    Args:
        utterance_list: iterable of utterance strings; it is not modified.
        mapping: optional dict of {substring: replacement}. When omitted,
            the module-level ``unicode_mapping`` table is used.

    Returns:
        A list with one cleaned string per input utterance, in the same order.

    Replacements are applied one after another in the mapping's iteration
    order, so a later entry also sees text produced by an earlier one.
    '''
    if mapping is None:
        # Resolved at call time so the function follows the current table.
        mapping = unicode_mapping
    cleaned_utterances_list = []
    for utterance in utterance_list:
        for bad, good in mapping.items():
            utterance = utterance.replace(bad, good)
        cleaned_utterances_list.append(utterance)
    return cleaned_utterances_list

In [30]:
# clean the val data
n_val = len(test_data['utterances'])  # number of dialogues; not used in this cell
# Clean the whole 'utterances' column in one pass. Only that column is
# rewritten, so there is no need to copy each row out and assign it back.
test_data['utterances'] = test_data['utterances'].apply(clean_utterance)

In [31]:
# Tokenizer for the "roberta-base" checkpoint — the same checkpoint name the
# RoBERTa classifier passes to RobertaModel.from_pretrained.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

In [32]:
# Prefer the GPU when one is visible to PyTorch, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cuda


### Model M1

In [33]:
class Model(nn.Module):
    """'roberta-base' encoder followed by a linear classification head.

    The encoder's hidden state at sequence position 0 is projected from
    768 features to ``num_classes`` scores.
    """

    def __init__(self, num_classes):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained('roberta-base')
        self.fc = nn.Linear(768, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return one row of ``num_classes`` scores per input sequence."""
        encoded = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        # encoded[0] is the first element of the encoder output; keep only
        # its vector at sequence position 0.
        first_token = encoded[0][:, 0, :]
        return self.fc(first_token)
    
# Emotion label -> class index used for the one-hot targets.
emotion_to_idx = {
    'surprise': 0,
    'fear': 1,
    'sadness': 2,
    'disgust': 3,
    'anger': 4,
    'neutral': 5,
    'joy': 6,
}
# Reverse lookup (class index -> emotion label), derived from the table above.
idx_to_emotion = {index: emotion for emotion, index in emotion_to_idx.items()}

    

In [34]:
# Build the evaluation dataset (inputs padded/truncated to 128 tokens) and
# iterate it in order, 8 samples per batch.
test_dataset = Dataset(data=test_data, tokenizer=tokenizer, max_len=128)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=8,
    shuffle=False,
    collate_fn=collate_fn,
)

In [35]:
# load model M1

# Instantiate the architecture, then overwrite its weights with the saved
# M1 checkpoint; map_location places the loaded tensors on `device`.
# NOTE(review): torch.load unpickles the file — only load trusted checkpoints.
model = Model(num_classes)
model.load_state_dict(
    torch.load('/kaggle/input/nlpa4-withmodels/M1.pth', map_location=device)
)
model = model.to(device)


# Loss function handed to evaluate() for both models.
criterion = nn.CrossEntropyLoss()

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [36]:
def evaluate(model, val_loader, criterion, num_classes):
    """Run ``model`` over ``val_loader`` and report loss and F1 scores.

    Batches are dicts with 'input_ids', 'attention_masks' and one-hot
    'labels'; tensors are moved to the module-level ``device``.

    Returns:
        (mean per-batch loss, micro F1, macro F1, weighted F1)
    """
    model.eval()
    total_loss = 0
    batch_preds = []
    batch_truth = []
    with torch.no_grad():
        for batch in tqdm(val_loader):
            input_ids = batch['input_ids'].to(device)
            attention_masks = batch['attention_masks'].to(device)
            labels = batch['labels'].to(device)  # one-hot encoded
            logits = model(input_ids, attention_masks)
            total_loss += criterion(logits.view(-1, num_classes), labels).item()
            # argmax over the class axis gives the predicted / true class index
            batch_preds.append(torch.argmax(logits, dim=1))
            batch_truth.append(torch.argmax(labels, dim=1))
        y_pred = torch.cat(batch_preds, dim=0).cpu().numpy()
        y_true = torch.cat(batch_truth, dim=0).cpu().numpy()
        f1_micro, f1_macro, f1_weighted = (
            f1_score(y_true, y_pred, average=mode)
            for mode in ('micro', 'macro', 'weighted')
        )
        # average of the per-batch losses
        mean_loss = total_loss / len(val_loader)
    return mean_loss, f1_micro, f1_macro, f1_weighted

In [37]:
# Score M1 on the evaluation set and report loss plus the three F1 averages.
loss, f1_micro, f1_macro, f1_weighted = evaluate(
    model=model,
    val_loader=test_loader,
    criterion=criterion,
    num_classes=num_classes,
)

print(f'Loss: {loss}, F1 Micro: {f1_micro}, F1 Macro: {f1_macro}, F1 Weighted: {f1_weighted}')

100%|██████████| 874/874 [00:33<00:00, 25.84it/s]

Loss: 0.3063106957990335, F1 Micro: 0.884549356223176, F1 Macro: 0.8625106878464001, F1 Weighted: 0.8838961470917037





### Model M2

In [39]:
class Model2(nn.Module):
    """Embedding -> 2-layer bidirectional GRU -> linear classifier.

    Token ids are embedded into 768 dimensions, run through the GRU, and the
    GRU output at sequence position 0 (both directions concatenated, 1536
    features) is projected to ``num_classes`` scores.
    """

    def __init__(self, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(50265, 768)
        self.gru = nn.GRU(768, 768, num_layers=2, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(768*2, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return class scores; ``attention_mask`` is accepted but not used."""
        embedded = self.embedding(input_ids)
        gru_out, _ = self.gru(embedded)
        # keep only the output at the first sequence position
        first_step = gru_out[:, 0, :]
        return self.fc(first_step)

In [41]:
# Instantiate M2 and overwrite its weights with the saved checkpoint;
# map_location places the loaded tensors on `device`.
# NOTE(review): torch.load unpickles the file — only load trusted checkpoints.
model2 = Model2(num_classes)
model2.load_state_dict(
    torch.load('/kaggle/input/nlpa4-withmodels/M2.pth', map_location=device)
)
model2.to(device)

Model2(
  (embedding): Embedding(50265, 768)
  (gru): GRU(768, 768, num_layers=2, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=1536, out_features=7, bias=True)
)

In [42]:
# Score M2 on the same evaluation set with the same loss function.
loss, f1_micro, f1_macro, f1_weighted = evaluate(
    model=model2,
    val_loader=test_loader,
    criterion=criterion,
    num_classes=num_classes,
)

print(f'Loss: {loss}, F1 Micro: {f1_micro}, F1 Macro: {f1_macro}, F1 Weighted: {f1_weighted}')

100%|██████████| 874/874 [00:20<00:00, 41.68it/s]

Loss: 1.1738981021487194, F1 Micro: 0.5394849785407725, F1 Macro: 0.33780431640347536, F1 Weighted: 0.4950421213811303



