In [20]:
import torch
from datasets import load_dataset
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig,
)
# from sklearnex import patch_sklearn
# patch_sklearn()

import gc
import json
import pickle
import numpy as np    
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score, classification_report


In [2]:
PATH='D:\\ghd\\NLP-Assignments\\Assignment4\\data\\'
OUTPATH='D:\\ghd\\NLP-Assignments\\Assignment4\\output\\'
BATCH_SIZE=4
MAX_LENGTH=128
MAX_UTTERANCES=25
EPOCHS=1
EOS='</s>'
SEP='[SEP]'
SOS='<s>'

torch.manual_seed(0)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_id = "roberta-base"

label_encoding ={
    'x':0,
    'surprise':1,
    'sadness':2,
    'anger':3,
    'fear':4,
    'disgust':5,
    'joy':6,
    'neutral':7
}

In [3]:
# read json
def get_data(file_path):
    global label_encoding
    with open(file_path) as f:
        data = json.load(f)
        # to pandas
        df = pd.DataFrame(data)

    sentence_len = []
    text=[]
    text_len=[]

    for i in range(len(df)):
        uterances=df.iloc[i]['utterances']
        speaker=df.iloc[i]['speakers']
        length=0
        sentences=[]
        for (speaker,uterance) in zip(speaker,uterances):
            sentence=speaker+': '+uterance
            sentences.append(sentence)
            length+=len(uterance.split())
        if(length==2):
            print(sentence)
            print(df.iloc[i])
        sentence_len.append(length)
        text_len.append(len(sentences))
        text.append(sentences)
        # print(length)
        # break
    y=[]
    for i in df["emotions"]:
        lst=[]
        for j in i:
            lst.append(label_encoding[j])
        if(len(lst)>MAX_UTTERANCES):
            lst=lst[:MAX_UTTERANCES]
        else:
            lst.extend([0]*(MAX_UTTERANCES-len(lst)))
        y.append(lst)
    x=[]
    for i in text:
        lst=[]
        for j in i:
            lst.append(j)
        if(len(lst)>MAX_UTTERANCES):
            lst=lst[:MAX_UTTERANCES]
        else:
            lst.extend([EOS]*(MAX_UTTERANCES-len(lst)))
        # print(lst)
        # print(len(lst))
        x.append(lst)

    return x[:100],y[:100]

In [4]:
x_train, y_train = get_data(PATH+'train_file.json')
x_val, y_val = get_data(PATH+'val_file.json')

Phoebe: No!
episode          utterance_915
speakers      [Phoebe, Phoebe]
emotions      [neutral, anger]
utterances          [No., No!]
triggers            [0.0, 0.0]
Name: 968, dtype: object
Phoebe: No!
episode          utterance_915
speakers      [Phoebe, Phoebe]
emotions      [neutral, anger]
utterances          [No., No!]
triggers            [0.0, 0.0]
Name: 3984, dtype: object


In [5]:
ii=17
x_train[ii],y_train[ii]

(['Rachel: They made you head of the department!',
  "Ross: No, I get to teach one of his advanced classes!  Why didn't I get head of the department?",
  'Joey: Oh! Hey Rach, listen umm',
  'Rachel: Yeah.',
  'Joey: I got a big date coming up, do you know a good restaurant?',
  "Rachel: Uh, Paul's Café. They got great food and it's really romantic.",
  'Joey: Ooh, great! Thanks!',
  'Rachel: Yeah! Oh, and then afterwards you can take her to the',
  "Joey: You sure are naming a lot of ways to postpone sex, I'll tell ya",
  'Rachel: Ooh, I miss dating.',
  '</s>',
  '</s>',
  '</s>',
  '</s>',
  '</s>',
  '</s>',
  '</s>',
  '</s>',
  '</s>',
  '</s>',
  '</s>',
  '</s>',
  '</s>',
  '</s>',
  '</s>'],
 [6, 6, 7, 7, 7, 7, 6, 7, 7, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [6]:
from torch import nn
tokenizer = RobertaTokenizerFast.from_pretrained(model_id)


In [7]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, data, tokenizer,labels):
        self.data=data
        self.tokenizer = tokenizer
        self.labels = labels
    
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):   
        encoded_pair = self.tokenizer(self.data[idx],max_length=MAX_LENGTH,truncation=True,return_tensors="pt",padding="max_length")
        # print(encoded_pair)
        input_ids = encoded_pair['input_ids'].squeeze(0)
        attention_mask = encoded_pair['attention_mask'].squeeze(0)
        return input_ids,attention_mask,torch.tensor(self.labels[idx])

In [8]:
train_dataset = Dataset(x_train,tokenizer,y_train)
val_dataset = Dataset(x_val,tokenizer,y_val)

In [9]:
for i in train_dataset:
    for j in i:
        print(len(j))
    break

25
25
25


In [10]:
from torch.utils.data import DataLoader
train_dataloader=DataLoader(train_dataset,batch_size=BATCH_SIZE,shuffle=False)
val_dataloader=DataLoader(val_dataset,batch_size=BATCH_SIZE,shuffle=False)

Roberta output should be of size BATCH_SIZExlabels, we need linear layer to output BATCH_SIZExlabels as emotions in one way or another after we will do softmax. Now manage this. 

In [11]:

class EmotionClassifier(nn.Module):
    def __init__(self, roberta_model, roberta_labels,num_labels):
        super(EmotionClassifier, self).__init__()
        self.roberta_labels = roberta_labels
        self.num_labels = num_labels
        self.roberta = roberta_model
        self.roberta.requires_grad_(False)  
        for param in self.roberta.roberta.encoder.layer[-3:].parameters():
            param.requires_grad = True

        self.linear = nn.Linear(MAX_UTTERANCES*roberta_labels, MAX_UTTERANCES*num_labels) 
        
    def forward(self, input_ids, attention_mask, token_type_ids=None):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask,token_type_ids=token_type_ids)
        output_logits=outputs.logits # output: (50,10)
        output_logits=output_logits.view(BATCH_SIZE,MAX_UTTERANCES*self.roberta_labels) # (2,25*10)
        # print("OUTPUT: ",output_logits.shape) # now working correctly
        logits = self.linear(output_logits)
        logits=logits.view(BATCH_SIZE,MAX_UTTERANCES,self.num_labels)
        # print("logits: ",logits.shape)
        softmax_output = nn.functional.softmax(logits, dim=-1)
        # print("softmax: ",softmax_output.shape)
        return softmax_output
    
num_labels = 8
roberta_labels = 10
config = AutoConfig.from_pretrained(model_id)
config.num_labels = roberta_labels
roberta_model = RobertaForSequenceClassification.from_pretrained(model_id,config=config)
model = EmotionClassifier(roberta_model,roberta_labels, num_labels)
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(),lr=0.005)
criterion = nn.CrossEntropyLoss()


train_losses=[]
val_losses=[]


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:


def train_epoch(model, optimizer,epoch):
    model.train()
    losses = 0
    for batch in tqdm(train_dataloader, desc=f"Epoch:{epoch}",total=len(train_dataloader), leave=False):
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        # token_type_ids = batch[2].to(device)
        labels = batch[2].to(device)
        # print("train: ",labels.shape,input_ids.shape,attention_mask.shape)
        utterance_wise_batch={}
        utterance_input_ids=input_ids.view(MAX_UTTERANCES*BATCH_SIZE,-1)
        utterance_attention_mask=attention_mask.view(MAX_UTTERANCES*BATCH_SIZE,-1)
        # for i in range(BATCH_SIZE):
        #     for j in range(MAX_UTTERANCES):
        #         utterance_input_ids.append(input_ids[i][j])
        #         utterance_attention_mask.append(attention_mask[i][j])
        # print(len(utterance_wise_batch),utterance_wise_batch[0])
        # utterance_input_ids=torch.stack(utterance_input_ids)
        # utterance_attention_mask=torch.stack(utterance_attention_mask)
        utterance_wise_batch['input_ids']=utterance_input_ids
        utterance_wise_batch['attention_mask']=utterance_attention_mask
        # print(utterance_input_ids.shape,utterance_attention_mask.shape,len(utterance_wise_batch))
        optimizer.zero_grad()
        predicted = model(utterance_input_ids, utterance_attention_mask)
        # print("lables: ",labels.shape)
        predicted=predicted.view(-1,num_labels)
        labels=labels.view(-1)
        loss = criterion(predicted, labels)
        losses += loss.item()
        loss.backward()
        optimizer.step()
        del input_ids
        del attention_mask
        del labels
        del predicted
        del utterance_input_ids
        del utterance_attention_mask
        del utterance_wise_batch
        gc.collect()
        torch.cuda.empty_cache()
        # break
    x = losses / len(list(train_dataloader))
    train_losses.append(x)
#     wandb.log({'epoch':epoch,'train_loss':x})
    tqdm.write(f"Epoch:{epoch}, Avg Train Loss: {x}")
    gc.collect()
    torch.cuda.empty_cache()
    return x



def evaluate(model,val_dataloader,name):
    with torch.no_grad():
        model.eval()
        losses = 0

        all_labels = []
        for batch in tqdm(val_dataloader, desc=name,total=len(val_dataloader), leave=False):
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            labels = batch[2].to(device)
            utterance_wise_batch={}
            utterance_input_ids=input_ids.view(MAX_UTTERANCES*BATCH_SIZE,-1)
            utterance_attention_mask=attention_mask.view(MAX_UTTERANCES*BATCH_SIZE,-1)
            utterance_wise_batch['input_ids']=utterance_input_ids
            utterance_wise_batch['attention_mask']=utterance_attention_mask
            predicted = model(utterance_input_ids, utterance_attention_mask)
            predicted=predicted.view(-1,num_labels)
            labels=labels.view(-1)
            loss = criterion(predicted, labels)
            losses += loss.item()
            predicted_labels = predicted.argmax(dim=-1)
            all_labels.extend(predicted_labels.cpu().numpy())
            del input_ids
            del attention_mask
            del labels
            del predicted
            del utterance_input_ids
            del utterance_attention_mask
            del utterance_wise_batch
            gc.collect()
            torch.cuda.empty_cache()
            # break
        x = losses / len(list(val_dataloader))
        val_losses.append(x)
        tqdm.write(f"Avg {name} Loss: {x}")
        gc.collect()
        torch.cuda.empty_cache()
        return x, all_labels
    



In [17]:
gc.collect()
torch.cuda.empty_cache()
for epoch in range(1, EPOCHS+1):
    train_loss = train_epoch(model, optimizer,epoch)
    val_loss,all_labels = evaluate(model,val_dataloader=val_dataloader,name='Val')
    torch.save(model, f"{OUTPATH}modelM1_epoch{epoch}.pth")
    print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}"))

                                                        

Epoch:1, Avg Train Loss: 1.529209361076355


                                                    

Avg Val Loss: 1.5720089387893676
Epoch: 1, Train loss: 1.529, Val loss: 1.572


In [18]:
torch.save(model, f"{OUTPATH}modelM1.pth")
torch.save(tokenizer, f"{OUTPATH}tokenizerM1.pth")

In [24]:
# load model
loaded_model = torch.load(f"{OUTPATH}modelM1.pth")
loaded_tokenizer = torch.load(f"{OUTPATH}tokenizerM1.pth")

# test
x_test, y_test = get_data(PATH+'val_file.json')
test_dataset = Dataset(x_test,loaded_tokenizer,y_test)
test_dataloader=DataLoader(test_dataset,batch_size=BATCH_SIZE,shuffle=False)
test_loss,all_labels = evaluate(loaded_model,test_dataloader,name='Test')


                                                     

Avg Test Loss: 1.5720089387893676


In [25]:

print(f"Test loss: {test_loss:.3f}")
f1_sccore = f1_score(y_test, all_labels, average='weighted')
f1_macro = f1_score(y_test, all_labels, average='macro')
print(f1_macro,f1_sccore)
print(classification_report(y_test, all_labels))

Test loss: 1.572


ValueError: Found input variables with inconsistent numbers of samples: [100, 2500]