In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import RobertaTokenizer, RobertaModel, RobertaConfig
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import f1_score
from tqdm import tqdm, trange

In [2]:
train_data = pd.read_json('/kaggle/input/nlp-a4/train_file.json')
val_data = pd.read_json('/kaggle/input/nlp-a4/val_file.json')

In [3]:
print(len(train_data))
print(len(val_data))

6740
843


In [4]:
train_data.head()

Unnamed: 0,episode,speakers,emotions,utterances,triggers
0,utterance_3492,"[Phoebe, Eric, Phoebe, Eric, Phoebe]","[surprise, fear, surprise, sadness, disgust]","[You-youyou had sex with Ursula?!, Uh, a litt...","[1.0, 1.0, 0.0, 0.0, 0.0]"
1,utterance_3952,"[Monica, Monica, Phoebe, Joey, Joey, Joey, Rac...","[disgust, disgust, anger, sadness, surprise, a...","[Dad, please don't pick your teeth out here!, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,utterance_3198,"[Older Scientist, Ross, Ross, Joey, Ross, Ross...","[neutral, neutral, neutral, neutral, neutral, ...","[Dr. Geller, there's a seat over here., Thank ...","[0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0]"
3,utterance_2834,"[Monica, Monica, Monica]","[neutral, surprise, neutral]","[So, how'd the lasagne go over?, Really?!, Good.]","[0.0, 0.0, 1.0]"
4,utterance_453,"[Kate, The Director, Kate]","[joy, sadness, sadness]","[Become a drama critic!, I am hurt! A plague ...","[0.0, 0.0, 1.0]"


In [5]:
st = set()
for i in train_data['emotions']:
    for j in i:
        st.add(j)

In [6]:
print("Unique Emotions in Data:")
print(st)

Unique Emotions in Data:
{'fear', 'anger', 'sadness', 'disgust', 'surprise', 'neutral', 'joy'}


In [7]:
#join utterances with @ sign
train_data['utterances'] = train_data['utterances'].apply(lambda x: '@'.join(x))
train_data['speakers'] = train_data['speakers'].apply(lambda x: '@'.join(x))
val_data['utterances'] = val_data['utterances'].apply(lambda x: '@'.join(x))
val_data['speakers'] = val_data['speakers'].apply(lambda x: '@'.join(x))

In [8]:
# dropping duplicates 
train_data = train_data.drop_duplicates(subset=['speakers', 'utterances'], keep='first')
val_data = val_data.drop_duplicates(subset=['speakers', 'utterances'], keep='first')

In [9]:
train_data['utterances'] = train_data['utterances'].apply(lambda x: x.split('@'))
train_data['speakers'] = train_data['speakers'].apply(lambda x: x.split('@'))
val_data['utterances'] = val_data['utterances'].apply(lambda x: x.split('@'))
val_data['speakers'] = val_data['speakers'].apply(lambda x: x.split('@'))

In [10]:
# resetting index as we dropped rows
train_data = train_data.reset_index(drop=True)
val_data = val_data.reset_index(drop=True)

In [11]:
# print((train_data['utterances'][0]))
n = len(train_data['utterances'][0])
for i in range(n):
    print((train_data['utterances'][0][i]))

You-youyou had sex with Ursula?!
Uh, a little bit. She-she-she walked in and I thought she was you and I kissed her and
You didn't notice she was wearing different clothes?!
Well I was just so excited to see you.
Oh. Ew! Ew! Ew! Ugh! Y'know what? This is too weird.


In [12]:
unicode_mapping = {}
# unicode_mapping['\u0085'] = '...' 
# unicode_mapping['\u0091'] = "'"
# unicode_mapping['\u0092'] = "'"
# unicode_mapping['\u0093'] = '"'
# unicode_mapping['\u0094'] = '"'
# unicode_mapping['\u0097'] = '--'

# unicode_mapping['\u2014'] = '--'
# unicode_mapping['\u2019'] = "'"
# unicode_mapping['\u2026'] = '...'

# unicode_mapping['\u00e9'] = 'e'

unicode_mapping['\x85'] = '...' 
unicode_mapping['\x91'] = "'"
unicode_mapping['\x92'] = "'"
unicode_mapping['\x93'] = '"'
unicode_mapping['\x94'] = '"'
unicode_mapping['\x97'] = '--'

unicode_mapping['\u2014'] = '--'
unicode_mapping['\u2019'] = "'"
unicode_mapping['\u2026'] = '...'

unicode_mapping['\xe9'] = 'e'



In [13]:
# replacing unicode characters in the data
def clean_utterance(utterance_list):
    '''	
    This function takes a list of utterances and replaces the unicode with the proper characters.
    '''
    cleaned_utterances_list = []
    for utterance in utterance_list:
        for key in unicode_mapping:
            utterance = utterance.replace(key, unicode_mapping[key])
        cleaned_utterances_list.append(utterance)
    return cleaned_utterances_list

# clean the train data
n_train = len(train_data['utterances'])
for i in train_data.index:
    temp = train_data.loc[i].copy()
    cleaned_utterances = clean_utterance(temp['utterances'])
    temp['utterances'] = cleaned_utterances
    train_data.loc[i] = temp

# clean the val data
n_val = len(val_data['utterances'])
for i in val_data.index:
    temp = val_data.loc[i].copy()
    cleaned_utterances = clean_utterance(temp['utterances'])
    temp['utterances'] = cleaned_utterances
    val_data.loc[i] = temp

In [14]:
print(train_data['utterances'][0])
# print(len(train_data['utterances'][0]))

['You-you...you had sex with Ursula?!', 'Uh, a little bit. She-she-she walked in and I thought she was you and I kissed her and', "You didn't notice she was wearing different clothes?!", 'Well I was just so excited to see you.', "Oh. Ew! Ew! Ew! Ugh! Y'know what? This is too weird."]


In [15]:
train_data.head()

Unnamed: 0,episode,speakers,emotions,utterances,triggers
0,utterance_3492,"[Phoebe, Eric, Phoebe, Eric, Phoebe]","[surprise, fear, surprise, sadness, disgust]","[You-you...you had sex with Ursula?!, Uh, a li...","[1.0, 1.0, 0.0, 0.0, 0.0]"
1,utterance_3952,"[Monica, Monica, Phoebe, Joey, Joey, Joey, Rac...","[disgust, disgust, anger, sadness, surprise, a...","[Dad, please don't pick your teeth out here!, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,utterance_3198,"[Older Scientist, Ross, Ross, Joey, Ross, Ross...","[neutral, neutral, neutral, neutral, neutral, ...","[Dr. Geller, there's a seat over here., Thank ...","[0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0]"
3,utterance_2834,"[Monica, Monica, Monica]","[neutral, surprise, neutral]","[So, how'd the lasagne go over?, Really?!, Good.]","[0.0, 0.0, 1.0]"
4,utterance_453,"[Kate, The Director, Kate]","[joy, sadness, sadness]","[Become a drama critic!, I am hurt! A plague ...","[0.0, 0.0, 1.0]"


In [16]:
print(val_data.shape)

(808, 5)


In [17]:
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [18]:
class Model(nn.Module):
    #model consists of a EmoBERTa model and a linear layer for sequence labeling task
    def __init__(self, num_classes):
        super(Model, self).__init__()
        self.roberta = RobertaModel.from_pretrained('roberta-base')
        self.fc = nn.Linear(768, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(input_ids, attention_mask)
        out = outputs[0]
        out = out[:, 0, :]
        out = self.fc(out)
        return out
    
def get_emotion_to_idx(data):
    emotion_to_idx = {}
    idx_to_emotion = {}
    idx = 0
    for i in data['emotions']:
        for j in i:
            if j not in emotion_to_idx:
                emotion_to_idx[j] = idx
                idx_to_emotion[idx] = j
                idx += 1
    return emotion_to_idx, idx_to_emotion

emotion_to_idx, idx_to_emotion = get_emotion_to_idx(train_data)
num_classes = len(emotion_to_idx)
print(num_classes)

7


In [19]:
class Dataset(Dataset):
    def __init__(self, data, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = self.process(data, tokenizer)
        self.max_len = max_len

    def __len__(self):
        return len(self.data['input'])

    def __getitem__(self, index):
        #tokenize the input
        input = self.data['input'][index]
        target = self.data['target'][index]
        encoding = self.tokenizer(input, return_tensors='pt', padding='max_length', max_length=self.max_len, truncation=True)
        return {'input_ids': encoding['input_ids'].flatten(), 'attention_mask': encoding['attention_mask'].flatten(), 'target': torch.tensor(target)}
        
    def process(self, data, tokenizer):
        new_data = {'input': [], 'target': []}
        for i in range(len(data)):
            newinput = data.loc[i].copy()
            stringlist = []
            for j in range(len(newinput['utterances'])):
                stringlist.append(newinput['speakers'][j] + ': ' + newinput['utterances'][j])
            for j in range(len(newinput['utterances'])):
                #join till jth utterance
                temp = ' '.join(stringlist[:j])
                temp += '</s></s>'
                #add jth utterance
                temp += newinput['speakers'][j] + ': ' + newinput['utterances'][j]
                temp += '</s></s>'
                #add later utterances
                temp += ' '.join(stringlist[j+1:])
                temp = '<s> ' + temp + '</s>'
                new_data['input'].append(temp)
                target = [0]*num_classes
                target[emotion_to_idx[newinput['emotions'][j]]] = 1.0
                new_data['target'].append(target)
        return new_data

In [20]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [21]:
def collate_fn(batch):
    input_ids = []
    attention_masks = []
    targets = []
    for i in batch:
        input_ids.append(i['input_ids'])
        attention_masks.append(i['attention_mask'])
        targets.append(i['target'])
    input_ids = torch.stack(input_ids, dim=0)
    attention_masks = torch.stack(attention_masks, dim=0)
    targets = torch.stack(targets, dim=0)
    return {
        'input_ids': input_ids,
        'attention_masks': attention_masks,
        'labels': targets
    }

In [22]:
train_dataset = Dataset(train_data, tokenizer, 128)
val_dataset = Dataset(val_data, tokenizer, 128)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)

model = Model(num_classes)
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Setup Wandb

In [23]:
import wandb
wandb.login(relogin=True)

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [24]:
model_config = {
    'model': 'RoBERTa with Linear Layer',
    'num_classes': num_classes,
    'learning_rate': 2e-5,
    'batch_size': 8,
    'optimizer': 'AdamW',
    'loss_function': 'CrossEntropyLoss'
}

In [25]:
wandb.init(project='assignment-4', entity='nlp-assignments', config=model_config)

[34m[1mwandb[0m: Currently logged in as: [33msahil21091[0m ([33mnlp-assignments[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [26]:
def train(model, train_loader, val_loader, criterion, optimizer, num_classes, num_epochs):
    wandb.define_metric('epoch')
    wandb.define_metric('train_epoch_loss', step_metric='epoch')
    wandb.define_metric('val_epoch_loss', step_metric='epoch')
    wandb.define_metric('f1_micro', step_metric='epoch')
    wandb.define_metric('f1_macro', step_metric='epoch')
    wandb.define_metric('f1_weighted', step_metric='epoch')
    train_losses = []
    val_losses = []
    # f1_micros = []
    # f1_macros = []
    # f1_weighteds = []
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        # avg_f1_micro = 0
        # avg_f1_macro = 0
        # avg_f1_weighted = 0
        #tqdm
        for i, data in enumerate(tqdm(train_loader)): 
            input_ids = data['input_ids'].to(device)
            attention_masks = data['attention_masks'].to(device)
            labels = data['labels'].to(device)
            optimizer.zero_grad()
            outputs = model(input_ids, attention_masks)
            loss = criterion(outputs.view(-1, num_classes), labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        #     predictions = torch.argmax(outputs, dim=1)
        #     labels = torch.argmax(labels, dim=1)
        #     f1_micro = f1_score(labels.cpu().numpy(), predictions.cpu().numpy(), average='micro')
        #     f1_macro = f1_score(labels.cpu().numpy(), predictions.cpu().numpy(), average='macro')
        #     f1_weighted = f1_score(labels.cpu().numpy(), predictions.cpu().numpy(), average='weighted')
        #     print(f1_micro, f1_macro, f1_weighted)
        #     avg_f1_micro += f1_micro
        #     avg_f1_macro += f1_macro
        #     avg_f1_weighted += f1_weighted
        # avg_f1_micro /= len(train_loader)
        # avg_f1_macro /= len(train_loader)
        # avg_f1_weighted /= len(train_loader)
        train_loss /= len(train_loader)
        train_losses.append(train_loss)
        # f1_micros.append(avg_f1_micro)
        # f1_macros.append(avg_f1_macro)
        # f1_weighteds.append(avg_f1_weighted)
        print(f'Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss}')
        model.eval()
        val_loss = 0
        with torch.no_grad():
            predictions = []
            true_labels = []
            for i, data in enumerate(val_loader):
                input_ids = data['input_ids'].to(device)
                attention_masks = data['attention_masks'].to(device)
                labels = data['labels'].to(device) #labels are one-hot encoded
                outputs = model(input_ids, attention_masks)
                loss = criterion(outputs.view(-1, num_classes), labels)
                val_loss += loss.item()
                predictions.append(torch.argmax(outputs, dim=1))
                true_labels.append(torch.argmax(labels, dim=1))
            predictions = torch.cat(predictions, dim=0)
            true_labels = torch.cat(true_labels, dim=0)
            print(predictions)
            print(true_labels)
            f1_micro = f1_score(true_labels.cpu().numpy(), predictions.cpu().numpy(), average='micro')
            f1_macro = f1_score(true_labels.cpu().numpy(), predictions.cpu().numpy(), average='macro')
            f1_weighted = f1_score(true_labels.cpu().numpy(), predictions.cpu().numpy(), average='weighted')
            val_loss /= len(val_loader)
            val_losses.append(val_loss)
            print(f'Epoch {epoch + 1}/{num_epochs}, Val Loss: {val_loss}, F1 Micro: {f1_micro}, F1 Macro: {f1_macro}, F1 Weighted: {f1_weighted}')
        log_dict = {
            'epoch': epoch,
            'train_epoch_loss': train_loss,
            'val_epoch_loss': val_loss,
            'f1_micro': f1_micro,
            'f1_macro': f1_macro,
            'f1_weighted': f1_weighted
        }
        wandb.log(log_dict)
    return train_losses, val_losses

In [27]:
train_losses, val_losses = train(model, train_loader, val_loader, criterion, optimizer, num_classes, 3)

100%|██████████| 4561/4561 [16:08<00:00,  4.71it/s]


Epoch 1/3, Train Loss: 1.0872886832529869
tensor([4, 5, 5,  ..., 5, 5, 5], device='cuda:0')
tensor([4, 5, 5,  ..., 3, 5, 0], device='cuda:0')
Epoch 1/3, Val Loss: 0.5882640423815808, F1 Micro: 0.8060085836909872, F1 Macro: 0.7560467800855564, F1 Weighted: 0.8014771204022083


100%|██████████| 4561/4561 [16:06<00:00,  4.72it/s]


Epoch 2/3, Train Loss: 0.5704681909864867
tensor([4, 5, 5,  ..., 3, 3, 3], device='cuda:0')
tensor([4, 5, 5,  ..., 3, 5, 0], device='cuda:0')
Epoch 2/3, Val Loss: 0.38271352033865014, F1 Micro: 0.8573676680972818, F1 Macro: 0.8326652452448212, F1 Weighted: 0.8558948710228961


100%|██████████| 4561/4561 [16:05<00:00,  4.73it/s]


Epoch 3/3, Train Loss: 0.42114014880508316
tensor([4, 5, 5,  ..., 5, 5, 5], device='cuda:0')
tensor([4, 5, 5,  ..., 3, 5, 0], device='cuda:0')
Epoch 3/3, Val Loss: 0.3063106948990125, F1 Micro: 0.884549356223176, F1 Macro: 0.8625106878464001, F1 Weighted: 0.8838961470917037


In [28]:
torch.save(model.state_dict(), 'M1.pth')

In [29]:
wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▅█
f1_macro,▁▆█
f1_micro,▁▆█
f1_weighted,▁▆█
train_epoch_loss,█▃▁
val_epoch_loss,█▃▁

0,1
epoch,2.0
f1_macro,0.86251
f1_micro,0.88455
f1_weighted,0.8839
train_epoch_loss,0.42114
val_epoch_loss,0.30631


In [38]:
 #Just one GRU followed by a linear layer
class Model2(nn.Module):
    def __init__(self, num_classes):
        super(Model2, self).__init__()
        self.embedding = nn.Embedding(50265, 768)
        self.gru = nn.GRU(768, 768, num_layers=2, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(768*2, num_classes)

    def forward(self, input_ids, attention_mask):
        out = self.embedding(input_ids)
        out, _ = self.gru(out)
        out = out[:, 0, :]
        out = self.fc(out)
        return out

In [39]:
model2 = Model2(num_classes)
model2 = model2.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model2.parameters(), lr=2e-5)

In [40]:
model_config = {
    'model': 'GRU with Linear Layer',
    'num_classes': num_classes,
    'learning_rate': 2e-5,
    'batch_size': 8,
    'optimizer': 'AdamW',
    'loss_function': 'CrossEntropyLoss'
}

In [41]:
import wandb
wandb.login(relogin=True)

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [42]:
wandb.init(project='assignment-4', entity='nlp-assignments', config=model_config)

In [43]:
train_losses, val_losses = train(model2, train_loader, val_loader, criterion, optimizer, num_classes, 3)

100%|██████████| 4561/4561 [07:29<00:00, 10.16it/s]


Epoch 1/3, Train Loss: 1.541118496030445
tensor([4, 4, 4,  ..., 5, 5, 5], device='cuda:0')
tensor([4, 5, 5,  ..., 3, 5, 0], device='cuda:0')
Epoch 1/3, Val Loss: 1.4468062862719768, F1 Micro: 0.45679542203147355, F1 Macro: 0.18730343284023301, F1 Weighted: 0.3540418260558057


100%|██████████| 4561/4561 [07:28<00:00, 10.17it/s]


Epoch 2/3, Train Loss: 1.3806476565508747
tensor([4, 5, 5,  ..., 5, 5, 5], device='cuda:0')
tensor([4, 5, 5,  ..., 3, 5, 0], device='cuda:0')
Epoch 2/3, Val Loss: 1.3018282090294553, F1 Micro: 0.49971387696709585, F1 Macro: 0.2703176481464467, F1 Weighted: 0.4329086756859499


100%|██████████| 4561/4561 [07:28<00:00, 10.17it/s]


Epoch 3/3, Train Loss: 1.2523062985812694
tensor([4, 5, 5,  ..., 5, 5, 5], device='cuda:0')
tensor([4, 5, 5,  ..., 3, 5, 0], device='cuda:0')
Epoch 3/3, Val Loss: 1.173898103683164, F1 Micro: 0.5394849785407725, F1 Macro: 0.33780431640347536, F1 Weighted: 0.4950421213811303


In [46]:
torch.save(model2.state_dict(), 'M2.pth')

In [45]:
wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▅█
f1_macro,▁▅█
f1_micro,▁▅█
f1_weighted,▁▅█
train_epoch_loss,█▄▁
val_epoch_loss,█▄▁

0,1
epoch,2.0
f1_macro,0.3378
f1_micro,0.53948
f1_weighted,0.49504
train_epoch_loss,1.25231
val_epoch_loss,1.1739
