In [1]:
import torch
from torch import nn
from datasets import load_dataset
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig,
)
# from sklearnex import patch_sklearn
# patch_sklearn()

import gc
import json
import pickle
import numpy as np    
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score, f1_score, classification_report
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))




In [2]:
PATH='data/'
OUTPATH='output/'
BATCH_SIZE=10
MAX_LENGTH=128
MAX_UTTERANCES=25
ROBERTA_LABELS=8
EPOCHS=2
EOS='</s>'
SEP='[SEP]'
SOS='<s>'

torch.manual_seed(0)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_id = "roberta-base"

label_encoding ={
    'x':0,
    'surprise':1,
    'sadness':2,
    'anger':3,
    'fear':4,
    'disgust':5,
    'joy':6,
    'neutral':7
}

In [36]:
# {"episode":"utterance_3492","speakers":["Phoebe","Eric","Phoebe","Eric","Phoebe"],"emotions":["surprise","fear","surprise","sadness","disgust"],"utterances":["You-you\u0085you had sex with Ursula?!","Uh, a little bit. She-she-she walked in and I thought she was you and I kissed her and","You didn't notice she was wearing different clothes?!","Well I was just so excited to see you.","Oh. Ew! Ew! Ew! Ugh! Y'know what? This is too weird."],"triggers":[1.0,1.0,0.0,0.0,0.0]}

with open("data/train_file.json") as f:
    data = json.load(f)
    # to pandas
    df = pd.DataFrame(data)
#  df to list 
lst=df.values.tolist()
for i in range(len(lst)):
    for j in range(len(lst[i])):
        if type(lst[i][j])==list:
            lst[i][j]=type(lst[i][j])
s=set(lst)
print(len(s))

TypeError: unhashable type: 'list'

In [3]:
# read json
def get_data(file_path):
    global label_encoding
    with open(file_path) as f:
        data = json.load(f)
        # to pandas
        df = pd.DataFrame(data)

    sentence_len = []
    text=[]
    text_len=[]

    for i in range(len(df)):
        uterances=df.iloc[i]['utterances']
        speaker=df.iloc[i]['speakers']
        length=0
        sentences=[]
        for (speaker,uterance) in zip(speaker,uterances):
            sentence=speaker+': '+uterance
            sentences.append(sentence)
            length+=len(uterance.split())
        if(length==2):
            print(sentence)
            print(df.iloc[i])
        sentence_len.append(length)
        text_len.append(len(sentences))
        text.append(sentences)
        # print(length)
        # break
    y=[]
    for i in df["emotions"]:
        lst=[]
        for j in i:
            lst.append(label_encoding[j])
        if(len(lst)>MAX_UTTERANCES):
            lst=lst[:MAX_UTTERANCES]
        else:
            lst.extend([0]*(MAX_UTTERANCES-len(lst)))
        y.append(lst)
    x=[]
    for i in text:
        lst=[]
        for j in i:
            lst.append(j)
        if(len(lst)>MAX_UTTERANCES):
            lst=lst[:MAX_UTTERANCES]
        else:
            lst.extend([EOS]*(MAX_UTTERANCES-len(lst)))
        # print(lst)
        # print(len(lst))
        x.append(lst)
    
    return x[:100],y[:100]
    return x,y

In [4]:
x_train, y_train = get_data(PATH+"train_file.json")
x_val, y_val = get_data(PATH+'val_file.json')

Phoebe: No!
episode          utterance_915
speakers      [Phoebe, Phoebe]
emotions      [neutral, anger]
utterances          [No., No!]
triggers            [0.0, 0.0]
Name: 968, dtype: object
Phoebe: No!
episode          utterance_915
speakers      [Phoebe, Phoebe]
emotions      [neutral, anger]
utterances          [No., No!]
triggers            [0.0, 0.0]
Name: 3984, dtype: object


In [5]:
len(x_train),len(y_train),len(x_val),len(y_val)

(6740, 6740, 843, 843)

In [6]:
tokenizer = RobertaTokenizerFast.from_pretrained(model_id)


In [7]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, data, tokenizer,labels):
        self.data=data
        self.tokenizer = tokenizer
        self.labels = labels
    
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):   
        encoded_pair = self.tokenizer(self.data[idx],max_length=MAX_LENGTH,truncation=True,return_tensors="pt",padding="max_length")
        # print(encoded_pair)
        input_ids = encoded_pair['input_ids'].squeeze(0)
        attention_mask = encoded_pair['attention_mask'].squeeze(0)
        return input_ids,attention_mask,torch.tensor(self.labels[idx])

In [8]:
train_dataset = Dataset(x_train,tokenizer,y_train)
val_dataset = Dataset(x_val,tokenizer,y_val)

In [9]:
for i in train_dataset:
    for j in i:
        print(len(j))
    break

25
25
25


In [10]:
from torch.utils.data import DataLoader
train_dataloader=DataLoader(train_dataset,batch_size=BATCH_SIZE,shuffle=False)
val_dataloader=DataLoader(val_dataset,batch_size=BATCH_SIZE,shuffle=False)

Roberta output should be of size BATCH_SIZExlabels, we need linear layer to output BATCH_SIZExlabels as emotions in one way or another after we will do softmax. Now manage this. 

In [37]:

class EmotionClassifier(nn.Module):
    def __init__(self, roberta_model, roberta_labels,num_labels):
        super(EmotionClassifier, self).__init__()
        self.roberta_labels = roberta_labels
        self.num_labels = num_labels
        self.roberta = roberta_model
        self.roberta.requires_grad_(False)  
        for param in self.roberta.roberta.encoder.layer[-2:].parameters():
            param.requires_grad = True

        # self.linear = nn.Linear(MAX_UTTERANCES*roberta_labels, MAX_UTTERANCES*num_labels)
        # lstm
        self.lstm = nn.LSTM(input_size=roberta_labels, hidden_size=roberta_labels, num_layers=1, batch_first=True) 
        
    def forward(self, input_ids, attention_mask, token_type_ids=None):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask,token_type_ids=token_type_ids)
        output_logits=outputs.logits # output: (50,10)
        batch_size = output_logits.shape[0]//MAX_UTTERANCES
        # print("OUTPUT: ",output_logits.shape, batch_size) 
        output_logits=output_logits.view(batch_size,MAX_UTTERANCES*self.roberta_labels) # (2,25*10)
        # print("OUTPUT: ",output_logits.shape) 
        # logits = self.linear(output_logits)
        logits=self.lstm(output_logits)
        logits=logits.view(batch_size,MAX_UTTERANCES,self.num_labels)
        # print("logits: ",logits.shape)
        softmax_output = nn.functional.softmax(logits, dim=-1)
        # print("softmax: ",softmax_output.shape)
        return softmax_output
    
num_labels = 8
config = AutoConfig.from_pretrained(model_id)
config.num_labels = ROBERTA_LABELS
roberta_model = RobertaForSequenceClassification.from_pretrained(model_id,config=config)
model = EmotionClassifier(roberta_model,ROBERTA_LABELS, num_labels)
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(),lr=0.005)
criterion = nn.CrossEntropyLoss()


train_losses=[]
val_losses=[]


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [38]:
def train_epoch(model, optimizer,epoch):
    model.train()
    losses = 0
    for batch in tqdm(train_dataloader, desc=f"Epoch:{epoch}",total=len(train_dataloader), leave=False):
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        # token_type_ids = batch[2].to(device)
        labels = batch[2].to(device)
        # print("train: ",labels.shape,input_ids.shape,attention_mask.shape)
        utterance_wise_batch={}
        batch_size=input_ids.shape[0]
        utterance_input_ids=input_ids.view(MAX_UTTERANCES*batch_size,-1)
        utterance_attention_mask=attention_mask.view(MAX_UTTERANCES*batch_size,-1)
        # for i in range(BATCH_SIZE):
        #     for j in range(MAX_UTTERANCES):
        #         utterance_input_ids.append(input_ids[i][j])
        #         utterance_attention_mask.append(attention_mask[i][j])
        # print(len(utterance_wise_batch),utterance_wise_batch[0])
        # utterance_input_ids=torch.stack(utterance_input_ids)
        # utterance_attention_mask=torch.stack(utterance_attention_mask)
        utterance_wise_batch['input_ids']=utterance_input_ids
        utterance_wise_batch['attention_mask']=utterance_attention_mask
        # print(utterance_input_ids.shape,utterance_attention_mask.shape,len(utterance_wise_batch))
        optimizer.zero_grad()
        predicted = model(utterance_input_ids, utterance_attention_mask)
        # print("lables: ",labels.shape)
        predicted=predicted.view(-1,num_labels)
        labels=labels.view(-1)
        # print("train: ",predicted.shape,labels.shape)
        predicted_labels=predicted.argmax(dim=-1)
        # print(predicted_labels,labels)
        loss = criterion(predicted, labels)
        losses += loss.item()
        loss.backward()
        optimizer.step()
        del input_ids
        del attention_mask
        del labels
        del predicted
        del utterance_input_ids
        del utterance_attention_mask
        del utterance_wise_batch
        gc.collect()
        torch.cuda.empty_cache()
        break
    x = losses / len(list(train_dataloader))
    train_losses.append(x)
#     wandb.log({'epoch':epoch,'train_loss':x})
    tqdm.write(f"Epoch:{epoch}, Avg Train Loss: {x}")
    gc.collect()
    torch.cuda.empty_cache()
    return x



def evaluate(model,val_dataloader,name):
    with torch.no_grad():
        model.eval()
        losses = 0

        all_labels = []
        for batch in tqdm(val_dataloader, desc=name,total=len(val_dataloader), leave=False):
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            labels = batch[2].to(device)
            utterance_wise_batch={}
            batch_size=input_ids.shape[0]
            utterance_input_ids=input_ids.view(MAX_UTTERANCES*batch_size,-1)
            utterance_attention_mask=attention_mask.view(MAX_UTTERANCES*batch_size,-1)
            utterance_wise_batch['input_ids']=utterance_input_ids
            utterance_wise_batch['attention_mask']=utterance_attention_mask
            predicted = model(utterance_input_ids, utterance_attention_mask)
            predicted=predicted.view(-1,num_labels)
            labels=labels.view(-1)
            loss = criterion(predicted, labels)
            losses += loss.item()
            predicted_labels = predicted.argmax(dim=-1)
            all_labels.extend(predicted_labels.cpu().numpy())
            del input_ids
            del attention_mask
            del labels
            del predicted
            del utterance_input_ids
            del utterance_attention_mask
            del utterance_wise_batch
            gc.collect()
            torch.cuda.empty_cache()
            # break
        x = losses / len(list(val_dataloader))
        val_losses.append(x)
        tqdm.write(f"Avg {name} Loss: {x}")
        gc.collect()
        torch.cuda.empty_cache()
        return x, all_labels
    



In [39]:
gc.collect()
torch.cuda.empty_cache()
for epoch in range(1, EPOCHS+1):
    train_loss = train_epoch(model, optimizer,epoch)
    val_loss,all_labels = evaluate(model,val_dataloader=val_dataloader,name='Val')
    if(epoch%1==0):
        torch.save(model, f"{OUTPATH}modelM1_epoch{epoch}.pth")
    print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}"))

Epoch:1:   0%|          | 0/674 [00:00<?, ?it/s]

  result = _VF.lstm(input, hx, self._flat_weights, self.bias, self.num_layers,


RuntimeError: shape '[1000000, 1]' is invalid for input of size 40000

In [None]:
torch.save(model, f"{OUTPATH}modelM1.pth")
torch.save(tokenizer, f"{OUTPATH}tokenizerM1.pth")

In [None]:
# load model
loaded_model = torch.load(f"{OUTPATH}modelM1.pth")
loaded_tokenizer = torch.load(f"{OUTPATH}tokenizerM1.pth")

# test
x_test, y_test = get_data(PATH+'val_file.json')
test_dataset = Dataset(x_test,loaded_tokenizer,y_test)
test_dataloader=DataLoader(test_dataset,batch_size=BATCH_SIZE,shuffle=False)
test_loss,all_labels = evaluate(loaded_model,test_dataloader,name='Test')


Test:   0%|          | 0/85 [00:00<?, ?it/s]

Avg Test Loss: 1.5519384145736694


In [None]:
y_test_list = np.array(y_test).flatten()

In [None]:
len(y_test_list),np.array(all_labels).shape,np.array(y_test).shape

(21075, (21075,), (843, 25))

In [None]:
dic1={}
dic2={}
count1=0
for i in y_test_list:
    count1+=1
    if i in dic1:
        dic1[i]+=1
    else:
        dic1[i]=1

count2=0
for i in all_labels:
    count2+=1
    if i in dic2:
        dic2[i]+=1
    else:
        dic2[i]=1

dic1,dic2,count1,count2

({3: 788, 7: 3200, 1: 1008, 5: 215, 4: 265, 2: 558, 0: 13782, 6: 1259},
 {7: 4215, 0: 16860},
 21075,
 21075)

In [None]:
print(f"Test loss: {test_loss:.3f}")
f1_sccore = f1_score(y_test_list, all_labels, average='weighted')
f1_macro = f1_score(y_test_list, all_labels, average='macro')
print(f1_macro,f1_sccore)
print(classification_report(y_test_list, all_labels))

Test loss: 1.552
0.1687305441920074 0.6465387859620123
              precision    recall  f1-score   support

           0       0.80      0.98      0.88     13782
           1       0.00      0.00      0.00      1008
           2       0.00      0.00      0.00       558
           3       0.00      0.00      0.00       788
           4       0.00      0.00      0.00       265
           5       0.00      0.00      0.00       215
           6       0.00      0.00      0.00      1259
           7       0.41      0.55      0.47      3200

    accuracy                           0.72     21075
   macro avg       0.15      0.19      0.17     21075
weighted avg       0.59      0.72      0.65     21075



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
